0
|
1 <?xml version="1.0"?>
|
11
|
2 <tool id="genetrack" name="GeneTrack" version="@WRAPPER_VERSION@.0">
|
0
|
3 <description>peak predictor</description>
|
|
4 <macros>
|
|
5 <import>genetrack_macros.xml</import>
|
|
6 </macros>
|
|
7 <expand macro="requirements" />
|
|
<command><![CDATA[
    ## Run the GeneTrack script shipped next to this wrapper. The path is
    ## single-quoted so a tool directory containing spaces still resolves.
    python '$__tool_directory__/genetrack.py'
    --input_format $input_format_cond.input_format
    ## Emit one "--input <dataset> <hid>" pair per selected dataset. Only the
    ## data parameter of the chosen format branch is read.
    #if str($input_format_cond.input_format) == "scidx":
        #for $i in $input_format_cond.input_scidx:
            --input '${i}' '${i.hid}'
        #end for
    #elif str($input_format_cond.input_format) == "gff":
        #for $i in $input_format_cond.input_gff:
            --input '${i}' '${i.hid}'
        #end for
    #end if
    ## Integer parameters are passed through unquoted.
    --sigma $sigma
    --exclusion $exclusion
    --up_width $up_width
    --down_width $down_width
    --filter $filter
]]></command>
|
|
<inputs>
    <!-- The selected format decides which dataset parameter is offered; the command template reads the matching one. -->
    <conditional name="input_format_cond">
        <param name="input_format" type="select" label="Format of files for conversion">
            <option value="scidx" selected="true">ScIdx</option>
            <option value="gff">Gff</option>
        </param>
        <when value="scidx">
            <param name="input_scidx" type="data" format="scidx" multiple="true" label="Predict peaks on" />
        </when>
        <when value="gff">
            <param name="input_gff" type="data" format="gff" multiple="true" label="Predict peaks on" />
        </when>
    </conditional>
    <!-- Peak-calling settings; each one is forwarded to the script as the command-line option of the same name. -->
    <param name="sigma" type="integer" value="5" min="1" label="Sigma to use when smoothing reads" help="Higher values increase computation but produce more smoothing." />
    <param name="exclusion" type="integer" value="20" min="1" label="Peak exclusion zone" help="Exclusion zone around each peak that prevents others from being called." />
    <param name="up_width" type="integer" value="10" min="0" label="Exclusion zone of upstream called peaks" />
    <param name="down_width" type="integer" value="10" min="0" label="Exclusion zone of downstream called peaks" />
    <param name="filter" type="integer" value="1" min="0" label="Absolute read filter" help="Removes peaks with lower peak height." />
</inputs>
|
|
<outputs>
    <!--
        Files found in the job's "output" directory are collected into this list as
        hidden GFF datasets. The pattern captures the whole file name as the element
        designation. The angle brackets of the named group are escaped because a
        literal "<" is not allowed inside an XML attribute value.
    -->
    <collection name="genetrack_output" type="list" label="Genetrack results on ${on_string}">
        <discover_datasets pattern="(?P&lt;designation&gt;.*)" directory="output" ext="gff" visible="false" />
    </collection>
</outputs>
|
|
<tests>
    <!-- GFF input, filter raised to 3. -->
    <test>
        <param name="input_format" value="gff" />
        <param name="input_gff" value="genetrack_input2.gff" ftype="gff" />
        <param name="sigma" value="5" />
        <param name="exclusion" value="20" />
        <param name="up_width" value="10" />
        <param name="down_width" value="10" />
        <param name="filter" value="3" />
        <output_collection name="genetrack_output" type="list">
            <element name="s5e20u10d10F3_on_data_1" file="genetrack_output2.gff" ftype="gff" />
        </output_collection>
    </test>
    <!-- ScIdx input with the same settings. -->
    <test>
        <param name="input_format" value="scidx" />
        <param name="input_scidx" value="genetrack_input3.scidx" ftype="scidx" />
        <param name="sigma" value="5" />
        <param name="exclusion" value="20" />
        <param name="up_width" value="10" />
        <param name="down_width" value="10" />
        <param name="filter" value="3" />
        <output_collection name="genetrack_output" type="list">
            <element name="s5e20u10d10F3_on_data_1" file="genetrack_output3.gff" ftype="gff" />
        </output_collection>
    </test>
    <!-- GFF input from the "unsorted" fixture, same settings. -->
    <test>
        <param name="input_format" value="gff" />
        <param name="input_gff" value="genetrack_input_unsorted4.gff" ftype="gff" />
        <param name="sigma" value="5" />
        <param name="exclusion" value="20" />
        <param name="up_width" value="10" />
        <param name="down_width" value="10" />
        <param name="filter" value="3" />
        <output_collection name="genetrack_output" type="list">
            <element name="s5e20u10d10F3_on_data_1" file="genetrack_output4.gff" ftype="gff" />
        </output_collection>
    </test>
</tests>
|
|
88 <help>
|
|
89 **What it does**
|
|
90
|
15
|
GeneTrack separately identifies peaks on the forward “+” (W) and reverse “-” (C) strands. The way that GeneTrack
works is to replace each tag with a probabilistic distribution of occurrences for that tag at and around its mapped
genomic coordinate. The distance decay of the probabilistic distribution is set by adjusting the value of the
tool's **Sigma to use when smoothing reads** parameter. GeneTrack then sums the distribution over all mapped
tags. This results in a smooth continuous trace that can be globally broadened or tightened by adjusting the
sigma value. GeneTrack starts with the highest smoothed peak first, treating each strand separately if indicated
by the data, then sets up an exclusion zone (centered over the peak) defined by the value of the **Peak exclusion
zone** parameter (see figure). The exclusion zone prevents any secondary peaks from being called on the same strand
within that exclusion zone. In rare cases, it may be desirable to set different exclusion zones upstream (more 5’)
versus downstream (more 3’) of the peak.
|
|
101
|
15
|
102 .. image:: $PATH_TO_IMAGES/genetrack.png
|
|
103
|
|
GeneTrack continues through the data in order of peak height, until no other peaks are found, and in principle will
call a peak at a single isolated tag, if no filter is set using the tool's **Absolute read filter** parameter. A
filter value of 1 means that it will stop calling peaks when the tag count in the peak hits 1 (so single tag peaks
will be excluded in this case). GeneTrack outputs **chrom** (chromosome number), **strand** (+/W or -/C strand),
**start** (lower coordinate of exclusion zone), **end** (higher coordinate of exclusion zone), and **value** (peak
height). GeneTrack's GFF output reports the start (lower coordinate) and end (higher coordinate) of the exclusion
zone.
|
|
111
|
|
112 In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein plus
|
|
113 a steric exclusion zone between the protein and the exonuclease. On the other hand the site might be considerably
|
|
114 smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).
|
12
|
115
|
15
|
In general, higher resolution data or smaller binding site size data should use smaller sigma values. Large binding
site size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20). For transcription factors
mapped by ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20). Sigma is typically
varied between ~3 and ~20. Too high a sigma value may merge two independent nearby binding events. This may be
desirable if closely bound factors are not distinguishable. Too low a sigma value will cause some tags that
contribute to a binding event to be excluded, because they may not be located sufficiently close to the main peak.
If alternative (mutually exclusive) binding is expected for two overlapping sites, and these sites are to be
independently recorded, then an empirically determined smaller exclusion zone width is set. Thus the value of sigma
is set empirically for each mapped factor, depending upon the resolution and binding site size of the binding event.
|
|
125
|
|
126 It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on
|
|
127 only a single coordinate (called Singletons, where stddev=0 in the output file). However, low coverage datasets might
|
|
128 be improved by including them, if additional analysis (e.g., motif discovery) validates them. In addition, idealized
|
|
129 action of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.
|
0
|
130
|
6
|
131 -----
|
|
132
|
0
|
133 **Options**
|
|
134
|
12
|
* **Sigma to use when smoothing reads** - Smooths clusters of tags via a Gaussian distribution.
* **Peak exclusion zone** - Exclusion zone around each peak, eliminating all other peaks on the same strand that are within a ± bp distance of the peak.
* **Exclusion zone of upstream called peaks** - Defines the exclusion zone centered over peaks upstream of a peak.
* **Exclusion zone of downstream called peaks** - Defines the exclusion zone centered over peaks downstream of a peak.
* **Absolute read filter** - Restricts output to only peaks with a larger peak height.
|
0
|
140 </help>
|
|
141 <expand macro="citations" />
|
|
142 </tool>
|