comparison microsatbed.xml @ 0:dee16d45a2a9 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/microsatbed commit 275acb787c01484c6e435c8864090d377c3fde75
author iuc
date Sun, 21 Jul 2024 07:18:47 +0000
parents
children 1a0ea94317a9
comparison
equal deleted inserted replaced
-1:000000000000 0:dee16d45a2a9
1
2 <tool id="microsatbed" name="STR to bed" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
3 <description>Short Tandem Repeats to bed features from fasta</description>
4 <macros> <!-- Version tokens plus the motif-length selector shared by the ALL and ALLBW modes -->
5 <token name="@TOOL_VERSION@">1.3.2</token>
6 <token name="@VERSION_SUFFIX@">0</token>
7 <token name="@PYFASTX_VERSION@">2.1.0</token>
8 <token name="@PYTHON_VERSION@">3.12.3</token>
9 <token name="@UCSC_VERSION@">455</token>
10 <macro name="subsetmacro">
11 <param name="subset" type="select" label="Select at least 1 specific motif length to report" help="Bed features will be output for every motif of the selected length(s) with the minimum required repeats or more" multiple="true" optional="false">
12 <option value="--di" selected="true">All dimers (AC,AG,AT,...)</option>
13 <option value="--tri">All trimers (ACG,..)</option>
14 <option value="--tetra">All tetramers (ACGT,..)</option>
15 <option value="--penta">All pentamers (ACGTC,..)</option>
16 <option value="--hexa">All hexamers (ACGTCG,..)</option>
17 <option value="--mono">All monomers (A,C...). Warning! Can produce overwhelming numbers of bed features</option>
18 </param> <!-- optional="false" enforces the "at least 1" rule stated in the label; a multiple select is otherwise optional by default -->
19 </macro>
20 </macros>
21 <requirements> <!-- Package dependencies; versions come from the tokens declared in the macros section -->
22 <requirement type="package" version="@PYTHON_VERSION@">python</requirement>
23 <requirement type="package" version="@PYFASTX_VERSION@">pyfastx</requirement>
24 <requirement type="package" version="@TOOL_VERSION@">pytrf</requirement>
25 <requirement type="package" version="@UCSC_VERSION@">ucsc-bedgraphtobigwig</requirement>
26 </requirements>
27 <required_files> <!-- find_str.py is the helper script run from the tool directory by the command section -->
28 <include path="find_str.py"/>
29 </required_files>
30 <version_command><![CDATA[python -c "import pytrf; from importlib.metadata import version; print(version('pytrf'))"]]></version_command> <!-- Prints the installed pytrf package version -->
31 <!-- Command line: NATIVE mode calls pytrf findstr directly, every other mode runs find_str.py. The bigwig and winwidth flags are emitted exactly once per windowed mode: next to the motif list for SPECIFICBW, and after the minimum-repeat options for ALLBW. --><command><![CDATA[
32 #if $mode_cond.mode == "NATIVE":
33 #if $reference_genome.genome_type_select == "history":
34 pytrf findstr -f '$mode_cond.outformat' -o '$bed' -r '$monomin' '$dimin' '$trimin' '$tetramin' '$pentamin' '$hexamin' '${reference_genome.fasta}'
35 #else:
36 pytrf findstr -f '$mode_cond.outformat' -o '$bed' -r '$monomin' '$dimin' '$trimin' '$tetramin' '$pentamin' '$hexamin' '${reference_genome.fasta.fields.path}'
37 #end if
38 #else:
39 python '${__tool_directory__}/find_str.py'
40 #if $reference_genome.genome_type_select == "history":
41 --fasta '${reference_genome.fasta}'
42 #else:
43 --fasta '${reference_genome.fasta.fields.path}'
44 #end if
45 --bed '$bed'
46 #if $mode_cond.mode == "SPECIFIC":
47 --specific '$mode_cond.specific'
48 #elif $mode_cond.mode == "SPECIFICBW":
49 --bigwig
50 --winwidth '$mode_cond.winwidth'
51 --specific '$mode_cond.specific'
52 #else:
53 #for $flag in $mode_cond.subset:
54 $flag
55 #end for
56 #end if
57 --monomin '$monomin'
58 --dimin '$dimin'
59 --trimin '$trimin'
60 --tetramin '$tetramin'
61 --pentamin '$pentamin'
62 --hexamin '$hexamin'
63 #if $mode_cond.mode == "ALLBW":
64 --bigwig
65 --winwidth '$mode_cond.winwidth'
66 #end if
67 #end if
68 ]]></command>
69 <inputs> <!-- Sequence source, reporting mode, and the minimum repeat count for each motif length -->
70 <conditional name="reference_genome">
71 <param name="genome_type_select" type="select" label="Select a source for fasta sequences to be searched for STRs" help="Options are to choose a built-in genome, or choose any history fasta file">
72 <option value="indexed">Use a Galaxy server built-in reference genome fasta</option>
73 <option value="history" selected="true">Use any fasta file from the current history</option>
74 </param>
75 <when value="indexed">
76 <param name="fasta" type="select" label="Choose a built-in genome" help="If the genome you need is not on the list, upload it and select it as a current history fasta" >
77 <options from_data_table="all_fasta"/>
78 </param>
79 </when>
80 <when value="history">
81 <param name="fasta" type="data" format="fasta,fasta.gz" label="Choose a fasta file from the current history" />
82 </when>
83 </conditional>
84 <conditional name="mode_cond">
85 <param name="mode" type="select" label="Select patterns by motif length; or provide a specific motif pattern to report?" help="Choose *By length:* or *By pattern:* to configure STR selection mode">
86 <option selected="true" value="ALL">By length: Report all motifs of one or more specified lengths (1-6nt) as bed features</option>
87 <option value="ALLBW">By length as windowed bigwig: Report all motifs of one or more specified lengths (1-6nt) as windowed density</option>
88 <option value="SPECIFIC">By motif: Report one or more specific motifs (such as TCA,GC) as bed features</option>
89 <option value="SPECIFICBW">By motif as windowed bigwig: Report one or more specific motifs (such as TCA,GC) as windowed density</option>
90 <option value="NATIVE">All exact STR: use the pytrf findstr native command to create a csv, tsv or gff output</option>
91 </param>
92 <when value="ALL">
93 <expand macro="subsetmacro"/>
94 </when>
95 <when value="ALLBW">
96 <expand macro="subsetmacro"/>
97 <param name="winwidth" type="integer" min="5" value="128" label="Window width for estimating STR bigwig density"/>
98 </when>
99 <when value="SPECIFIC">
100 <param name="specific" type="text" label="Supply a specific motif pattern. Separate multiple patterns with commas such as GA,GC" help="Make bed features only for the nominated specific motifs."/>
101 </when>
102 <when value="SPECIFICBW">
103 <param name="specific" type="text" label="Supply a specific motif pattern. Separate multiple patterns with commas such as GA,GC" help="Make bed features only for the nominated specific motifs."/>
104 <param name="winwidth" type="integer" min="5" value="128" label="Window width for estimating STR bigwig density"/>
105 </when>
106 <when value="NATIVE">
107 <param name="outformat" type="select" label="Select the required output format" help="Pytrf can create GFF, CSV or TSV output files. Documentation is linked in the help section below">
108 <option value="gff">GFF</option>
109 <option value="csv">Comma separated values</option>
110 <option value="tsv" selected="true">Tab separated values</option>
111 </param>
112 </when>
113 </conditional>
114 <param name="monomin" type="integer" min="2" value="10" label="Minimum repeats required for monomers"/>
115 <param name="dimin" type="integer" min="1" value="3" label="Minimum repeats required for dimers"/> <!-- Only dimers accept a minimum of 1, so single (non-tandem) dimer occurrences can be reported -->
116 <param name="trimin" type="integer" min="2" value="2" label="Minimum repeats required for trimers"/>
117 <param name="tetramin" type="integer" min="2" value="2" label="Minimum repeats required for tetramers"/>
118 <param name="pentamin" type="integer" min="2" value="2" label="Minimum repeats required for pentamers"/>
119 <param name="hexamin" type="integer" min="2" value="2" label="Minimum repeats required for hexamers"/>
120 </inputs>
121 <outputs> <!-- One output dataset: bed by default, retyped by the native output format or by a bigwig mode -->
122 <data label="STR from $fasta.element_identifier" format="bed" name="bed">
123 <change_format>
124 <when format="gff" value="gff" input="mode_cond.outformat" />
125 <when format="csv" value="csv" input="mode_cond.outformat" />
126 <when format="tabular" value="tsv" input="mode_cond.outformat" />
127 <when format="bigwig" value="ALLBW" input="mode_cond.mode" />
128 <when format="bigwig" value="SPECIFICBW" input="mode_cond.mode" />
129 </change_format>
130 </data>
131 </outputs>
132 <tests>
133 <test expect_num_outputs="1"> <!-- Mode ALL: dimers to hexamers on humsamp.fa, compared exactly against bed_sample -->
134 <conditional name="reference_genome">
135 <param value="history" name="genome_type_select" />
136 <param value="humsamp.fa" name="fasta" />
137 </conditional>
138 <conditional name="mode_cond">
139 <param value="ALL" name="mode" />
140 <param value="--di,--tri,--tetra,--penta,--hexa" name="subset" />
141 </conditional>
142 <param value="20" name="monomin" />
143 <param value="20" name="dimin" />
144 <param value="5" name="trimin" />
145 <param value="5" name="tetramin" />
146 <param value="3" name="pentamin" />
147 <param value="2" name="hexamin" />
148 <output value="bed_sample" name="bed" lines_diff="0" compare="diff">
149 <assert_contents>
150 <has_n_columns n="5" />
151 <has_text text="hpat1" />
152 <has_text text="CCCCAC_2" />
153 <has_text text="TTTTTT_2" />
154 </assert_contents>
155 </output>
156 </test>
157 <test expect_num_outputs="1"> <!-- Mode SPECIFIC: only the GC motif on humsamp.fa, compared exactly against dibed_sample -->
158 <conditional name="reference_genome">
159 <param value="history" name="genome_type_select" />
160 <param value="humsamp.fa" name="fasta" />
161 </conditional>
162 <conditional name="mode_cond">
163 <param value="SPECIFIC" name="mode" />
164 <param value="GC" name="specific" />
165 </conditional>
166 <param value="20" name="monomin" />
167 <param value="1" name="dimin" />
168 <param value="20" name="trimin" />
169 <param value="20" name="tetramin" />
170 <param value="20" name="pentamin" />
171 <param value="20" name="hexamin" />
172 <output value="dibed_sample" name="bed" lines_diff="0" compare="diff">
173 <assert_contents>
174 <has_n_columns n="5" />
175 <has_text text="hpat1" />
176 <has_text text="GC_1" />
177 <not_has_text text="TC_1" />
178 <has_text text="209316" />
179 </assert_contents>
180 </output>
181 </test>
182 <test expect_num_outputs="1"> <!-- Mode NATIVE: pytrf findstr gff output on mouse.fa, compared exactly against nativegff_sample -->
183 <conditional name="reference_genome">
184 <param value="history" name="genome_type_select" />
185 <param value="mouse.fa" name="fasta" />
186 </conditional>
187 <conditional name="mode_cond">
188 <param value="NATIVE" name="mode" />
189 <param value="gff" name="outformat" />
190 </conditional>
191 <param value="20" name="monomin" />
192 <param value="10" name="dimin" />
193 <param value="5" name="trimin" />
194 <param value="4" name="tetramin" />
195 <param value="4" name="pentamin" />
196 <param value="2" name="hexamin" />
197 <output value="nativegff_sample" name="bed" lines_diff="0" compare="diff">
198 <assert_contents>
199 <has_n_columns n="9" />
200 <has_text text="Motif=CCGCCG;Type=6;Repeat=2;Length=12" />
201 <has_text text="mm10_knownGene_uc008xda.1" />
202 <has_text text="Motif=AGAGAG;Type=6;Repeat=2;Length=12" />
203 </assert_contents>
204 </output>
205 </test>
206 <test expect_num_outputs="1"> <!-- Mode SPECIFICBW: GC motif density bigwig on humsamp.fa, checked by file size only -->
207 <conditional name="reference_genome">
208 <param value="history" name="genome_type_select" />
209 <param value="humsamp.fa" name="fasta" />
210 </conditional>
211 <conditional name="mode_cond">
212 <param value="SPECIFICBW" name="mode" />
213 <param value="GC" name="specific" />
214 </conditional>
215 <param value="20" name="monomin" />
216 <param value="1" name="dimin" />
217 <param value="20" name="trimin" />
218 <param value="20" name="tetramin" />
219 <param value="20" name="pentamin" />
220 <param value="20" name="hexamin" />
221 <output value="dibed_wig_sample" name="bed" delta="10" compare="sim_size">
222 <assert_contents>
223 <has_size delta="10" value="73544"/>
224 </assert_contents>
225 </output>
226 </test>
227 </tests>
228 <help><![CDATA[
229
230 **Convert short repetitive sequences to bed features or windowed density bigwigs**
231
232 Microsatellites are usually defined as repeated short DNA patterns in an unbroken sequence.
233 A microsatellite pattern or *motif* can be any combination of nucleotides, typically from 1 to 6nt in length.
234
235 This tool allows microsatellite and related features to be selected from a fasta sequence input file, and output into a track, suitable for viewing in a genome browser such as JBrowse2.
236
237 All motifs of selected lengths can be reported as individual features in the output bed file, or specific motifs can be provided and all
238 others will be ignored. In all cases, a minimum required number of repeats can be specified. For example, requiring 2 or more repeats of the trimer *ACG* will report
239 every sequence of *ACGACG* or *ACGACGACG* or *ACGACGACGACG* and so on, as individual bed features. Similarly, requiring 3 repeats of any trimer will
240 report every distinct 3 nucleotide pattern, including *ACGACGACG* as well as every other unique 3 nucleotide pattern with 3 sequential repeats or more, such as *CTCCTCCTC*.
241
242 For other output formats, the pytrf native command line *findstr* can be used to produce a gff, csv or tsv output containing all exact short tandem repeats, as
243 described at the end of https://pytrf.readthedocs.io/en/latest
244
245 A fasta file must be supplied for processing. A built in genome can be selected, or a fasta file of any kind can be selected from the current history. Note that all
246 symbols are treated as valid nucleotides by pytrf, so extraneous characters such as *-* or *N* in the input fasta may appear as unexpected bed features. Lower case fasta symbols will be converted
247 to uppercase, to prevent them being reported as distinct motifs.
248
249 Output can be bed format, or for two kinds of operation, a bigwig track showing bases covered by selected features over a configurable window size with a default of 128nt.
250
251 **Select motifs by length - for bed or windowed density bigwig**
252
253 The default tool form setting is to select all dimer motif patterns.
254
255 Any combination of motif lengths from 1 to 6nt can be selected in the multiple-select drop-down list. All features will be returned in a single bed file. For each selected motif length,
256 the minimum number of repeats required for reporting can be adjusted. **Tandem repeats** are defined as at least 2 of any pattern. This tool allows singleton dimer motifs to be reported,
257 so is not restricted to short tandem repeats (STR).
258
259 This mode of operation can produce a bed file with every STR as a separate feature.
260 These can be very large and a bigwig containing the sum of STR bases over a selectable window size (default 128) may be more
261 useful and much faster to load.
262
263 **Select motifs by pattern - for bed or windowed density bigwig**
264
265 This option allows a motif pattern to be specified as a text string such as *CG* or *ATC*. Multiple motifs can be specified as a comma separated string such as *CG,ATC*.
266 All features will be returned as a single bed file.
267
268 The minimum number of repeats for all motifs can be set to match specific requirements.
269
270 For example, technical sequencing read bias may be influenced by the density of specific dimers, whether they are repeated or not
271 such as in https://github.com/arangrhie/T2T-Polish/tree/master/pattern
272
273 This mode of operation can produce a bed file with every STR as a separate feature.
274 These can be very large and a bigwig containing the sum of STR bases over a selectable window size (default 128) may be more
275 useful and much faster to load.
276
277 **Select all perfect STR using pytrf findstr in csv, tsv or gff output format**
278
279 This selection runs the pytrf *findstr* option to create gff/csv/tsv outputs as described at the end of https://pytrf.readthedocs.io/en/latest/.
280
281 Quoted here:
282
283 *A Tandem repeat (TR) in genomic sequence is a set of adjacent short DNA sequence repeated consecutively. The core sequence or repeat unit is generally called motif.
284 According to the motif length, tandem repeats can be classified as microsatellites and minisatellites. Microsatellites are also known as simple sequence repeats (SSRs)
285 or short tandem repeats (STRs) with motif length of 1-6 bp. Minisatellites are also sometimes referred to as variable number of tandem repeats (VNTRs) has longer motif length than microsatellites.
286 Pytrf is a lightweight Python C extension for identification of tandem repeats. The pytrf enables to fastly identify both exact or perfect SSRs.
287 It also can find generic tandem repeats with any size of motif, such as with maximum motif length of 100 bp. Additionally, it has capability of finding approximate or imperfect tandem repeats*
288
289 ]]></help>
290 <citations> <!-- Cites the pytrf GitHub repository as a BibTeX misc entry -->
291 <citation type="bibtex">@misc{pytrf,
292 title = {{pytrf} Short tandem repeat finder, Accessed on July 10 2024},
293 howpublished = {\url{https://github.com/lmdu/pytrf}},
294 note = {Accessed on July 10 2024}
295 }</citation>
296 </citations>
297 </tool>