14
|
1 <tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="3.0.2">
|
0
|
2 <description>using coordinates from assembled/unassembled genomes</description>
|
1
|
<requirements>
    <!-- Pinned package dependencies declared for this tool. -->
    <requirement version="0.7.1" type="package">bx-python</requirement>
    <requirement version="35x1" type="package">faToTwoBit</requirement>
</requirements>
|
0
|
<command>
<![CDATA[
## Builds the extract_genomic_dna.py command line from the tool form.
## File-path arguments are single-quoted so paths containing spaces or
## shell metacharacters reach the script as a single argument.
##
## Genome build recorded on the input dataset; forwarded via --genome.
#set genome = $input.metadata.dbkey
mkdir -p output_dir &&
python '$__tool_directory__/extract_genomic_dna.py'
--input '$input'
--genome "$genome"
## GFF-family inputs use fixed column numbers; any other input takes the
## column numbers from the dataset's interval metadata.
#if $input.is_of_type("gff"):
    --input_format "gff"
    --columns "1,4,5,7"
    --interpret_features $interpret_features
#else:
    --input_format "interval"
    --columns "${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol},${input.metadata.nameCol}"
#end if
--reference_genome_source $reference_genome_cond.reference_genome_source
#if str($reference_genome_cond.reference_genome_source) == "cached":
    ## Locally cached genome: pass the "path" field of the selected data table entry.
    --reference_genome '$reference_genome_cond.reference_genome.fields.path'
#else:
    ## Genome supplied as a dataset from the history.
    --reference_genome '$reference_genome_cond.reference_genome'
#end if
--output_format $output_format_cond.output_format
## The header options only exist in the form when fasta output is selected.
#if str($output_format_cond.output_format) == "fasta":
    --fasta_header_type $output_format_cond.fasta_header_type_cond.fasta_header_type
    #if str($output_format_cond.fasta_header_type_cond.fasta_header_type) == "char_delimited":
        --fasta_header_delimiter $output_format_cond.fasta_header_type_cond.fasta_header_delimiter
    #end if
#end if
--output '$output'
]]>
</command>
|
|
<inputs>
    <!-- Intervals (or GFF features) whose sequences are fetched; the dataset must have a genome build set. -->
    <param name="input" type="data" format="gff,interval" label="Fetch sequences for intervals in">
        <validator type="unspecified_build" />
    </param>
    <!-- Only forwarded to the script when the input is of a gff type (see the command template). -->
    <param name="interpret_features" type="select" label="Interpret features when possible" help="Applicable only when input dataset format is in the gff family">
        <option value="yes">Yes</option>
        <option value="no">No</option>
    </param>
    <!-- Where the reference genome comes from: the "twobit" data table or a fasta dataset in the history. -->
    <conditional name="reference_genome_cond">
        <param name="reference_genome_source" type="select" label="Choose the source for the reference genome">
            <option value="cached">locally cached</option>
            <option value="history">from history</option>
        </param>
        <when value="cached">
            <param name="reference_genome" type="select" label="Using reference genome">
                <!-- Offer only data table rows whose first column matches the input's dbkey. -->
                <options from_data_table="twobit">
                    <filter type="data_meta" key="dbkey" ref="input" column="0"/>
                </options>
                <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
            </param>
        </when>
        <when value="history">
            <param name="reference_genome" type="data" format="fasta" label="Using reference genome">
                <!-- Offer only history datasets whose dbkey matches the input's dbkey. -->
                <options>
                    <filter type="data_meta" key="dbkey" ref="input"/>
                </options>
                <validator type="no_options" message="The current history does not include a fasta dataset with the build associated with the selected input file"/>
            </param>
        </when>
    </conditional>
    <!-- Output format; the fasta branch additionally exposes the header layout options. -->
    <conditional name="output_format_cond">
        <param name="output_format" type="select" label="Select output format">
            <option value="fasta" selected="True">fasta</option>
            <option value="interval">interval</option>
        </param>
        <when value="fasta">
            <conditional name="fasta_header_type_cond">
                <param name="fasta_header_type" type="select" label="Select fasta header format">
                    <option value="bedtools_getfasta_default" selected="True">bedtools getfasta default</option>
                    <option value="char_delimited">character delimited field values</option>
                </param>
                <when value="bedtools_getfasta_default"/>
                <when value="char_delimited">
                    <!-- NOTE(review): the option values "tilda" and "vetical_bar" look misspelled, but they are
                         passed verbatim to the script as the fasta_header_delimiter argument and may be stored in
                         saved workflows. Confirm what extract_genomic_dna.py accepts before renaming them. -->
                    <param name="fasta_header_delimiter" type="select" label="Select fasta header field delimiter">
                        <option value="underscore" selected="True">underscore (_)</option>
                        <option value="semicolon">semicolon (;)</option>
                        <option value="comma">comma (,)</option>
                        <option value="tilda">tilde (~)</option>
                        <option value="vetical_bar">vertical bar (|)</option>
                    </param>
                </when>
            </conditional>
        </when>
        <when value="interval"/>
    </conditional>
</inputs>
|
|
<outputs>
    <!-- Single result dataset. Format and metadata are taken from the input
         dataset; the format is switched to fasta when fasta output is selected. -->
    <data name="output" format_source="input" metadata_source="input">
        <change_format>
            <when format="fasta" input="output_format_cond.output_format" value="fasta"/>
        </change_format>
    </data>
</outputs>
|
|
<tests>
    <!-- Parameter and output names below must match the names declared in the
         inputs and outputs sections: reference_genome_source, reference_genome,
         output_format and output. -->
    <!-- NOTE(review): the "cached" tests depend on the test environment's
         twobit data table containing entries for hg17, droPer1 and mm9. -->
    <test>
        <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
        <param name="interpret_features" value="yes"/>
        <param name="reference_genome_source" value="cached"/>
        <param name="output_format" value="fasta"/>
        <param name="fasta_header_type" value="char_delimited"/>
        <param name="fasta_header_delimiter" value="underscore"/>
        <output name="output" file="extract_genomic_dna_out1.fasta" compare="contains" />
    </test>
    <test>
        <param name="input" value="droPer1.bed" dbkey="droPer1" ftype="bed" />
        <param name="interpret_features" value="yes"/>
        <param name="reference_genome_source" value="cached"/>
        <param name="output_format" value="fasta"/>
        <param name="fasta_header_type" value="char_delimited"/>
        <param name="fasta_header_delimiter" value="underscore"/>
        <output name="output" file="extract_genomic_dna_out2.fasta" compare="contains" />
    </test>
    <test>
        <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
        <param name="interpret_features" value="yes"/>
        <param name="reference_genome_source" value="cached"/>
        <param name="output_format" value="interval"/>
        <output name="output" file="extract_genomic_dna_out3.interval" compare="contains" />
    </test>
    <!-- Test GFF file support. -->
    <test>
        <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
        <param name="interpret_features" value="no"/>
        <param name="reference_genome_source" value="cached"/>
        <param name="output_format" value="interval"/>
        <output name="output" file="extract_genomic_dna_out4.gff" compare="contains" />
    </test>
    <test>
        <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
        <param name="interpret_features" value="no"/>
        <param name="reference_genome_source" value="cached"/>
        <param name="output_format" value="fasta"/>
        <param name="fasta_header_type" value="char_delimited"/>
        <param name="fasta_header_delimiter" value="underscore"/>
        <output name="output" file="extract_genomic_dna_out5.fasta" compare="contains" />
    </test>
    <!-- Test custom sequences support and GFF feature interpretation. -->
    <test>
        <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
        <param name="interpret_features" value="no"/>
        <param name="reference_genome_source" value="history"/>
        <param name="reference_genome" value="tophat_in1.fasta"/>
        <param name="output_format" value="fasta"/>
        <param name="fasta_header_type" value="char_delimited"/>
        <param name="fasta_header_delimiter" value="underscore"/>
        <output name="output" file="extract_genomic_dna_out6.fasta" compare="contains" />
    </test>
    <test>
        <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
        <param name="interpret_features" value="yes"/>
        <param name="reference_genome_source" value="history"/>
        <param name="reference_genome" value="tophat_in1.fasta"/>
        <param name="output_format" value="fasta"/>
        <param name="fasta_header_type" value="bedtools_getfasta_default"/>
        <output name="output" file="extract_genomic_dna_out7.fasta" compare="contains" />
    </test>
</tests>
|
|
166 <help>
|
|
167
|
|
168 .. class:: warningmark
|
|
169
|
7
|
170 This tool requires interval or gff (special tabular formatted data). If your data is not TAB delimited, first use *Text Manipulation->Convert*.
|
|
171
|
|
172 .. class:: warningmark
|
|
173
|
|
174 Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified).
|
|
175
|
|
176 .. class:: warningmark
|
0
|
177
|
7
|
All of the following will cause a line from the input dataset to be skipped and a warning generated. The number of warnings and skipped lines is documented in the resulting history item.

- Any lines that do not contain at least 3 columns, a chromosome and numerical start and end coordinates.
- Sequences that fall outside of the range of a line's start and end coordinates.
- Chromosome, start or end coordinates that are invalid for the specified build.
- Any lines whose data columns are not separated by a **TAB** character (other white-space characters are invalid).
|
|
183
|
0
|
184 -----
|
|
185
|
|
186 **What it does**
|
|
187
|
7
|
188 This tool uses coordinate, strand, and build information to fetch genomic DNAs in FASTA or interval format.
|
18
|
189
|
19
|
If the output format is FASTA, the header format can be specified. Selecting the **bedtools getfasta default**
option produces a FASTA header formatted like the default header produced by the bedtools getfasta tool, and
the "force strandedness" option is assumed. If the input data includes a strand column and the strand is '+'
or '-', it is included in the header. If the input data includes a strand column and the value is anything but
'+' or '-', a '.' is included in the header. If the input data does not include a strand column, a '.' is included
in the header.
|
14
|
196
|
18
|
197 An example FASTA header produced by selecting this option is:
|
|
198
|
|
199 >chr7:127475281-127475310(+)
|
|
200
|
|
Selecting the **character delimited field values** option allows selection of a character delimiter that is used
when generating the FASTA header with fields genome, chrom, start, end, strand (name) delimited by the
selected character. For example, selecting an underscore will produce a FASTA header like this:
|
|
204
|
|
205 >mm9_53_550_+ test_chromosome
|
14
|
206
|
20
|
207 while selecting a vertical bar will produce a FASTA header like this:
|
14
|
208
|
18
|
209 >mm9|53|550|+ test_chromosome
|
7
|
210
|
|
211 If strand is not defined, the default value is "+".
|
0
|
212
|
|
213 -----
|
|
214
|
|
215 **Example**
|
|
216
|
|
217 If the input dataset is::
|
|
218
|
|
219 chr7 127475281 127475310 NM_000230 0 +
|
|
220 chr7 127485994 127486166 NM_000230 0 +
|
|
221 chr7 127486011 127486166 D49487 0 +
|
|
222
|
14
|
Extracting sequences with the **fasta** output format, the **character delimited field values** header format and the **underscore** delimiter returns::
|
0
|
224
|
|
225 >hg17_chr7_127475281_127475310_+ NM_000230
|
|
226 GTAGGAATCGCAGCGCCAGCGGTTGCAAG
|
|
227 >hg17_chr7_127485994_127486166_+ NM_000230
|
|
228 GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG
|
|
229 GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC
|
|
230 CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG
|
|
231 GATCAATGACATTTCACACACG
|
|
232 >hg17_chr7_127486011_127486166_+ D49487
|
|
233 TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG
|
|
234 CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA
|
|
235 CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
|
|
236 ACACG
|
|
237
|
7
|
238 Extracting sequences with **Interval** output data type returns::
|
|
239
|
|
240 chr7 127475281 127475310 NM_000230 0 + GTAGGAATCGCAGCGCCAGCGGTTGCAAG
|
|
241 chr7 127485994 127486166 NM_000230 0 + GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
|
|
242 chr7 127486011 127486166 D49487 0 + TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
|
|
243
|
0
|
244 </help>
|
|
<citations>
    <!-- BibTeX separates multiple authors with " and "; a comma would be read
         as a single "Last, First" name.
         NOTE(review): the key, title, year and eprint fields are "None"
         placeholders; fill them in if a proper reference exists. -->
    <citation type="bibtex">
        @unpublished{None,
        author = {Guru Ananda and Greg Von Kuster},
        title = {None},
        year = {None},
        eprint = {None},
        url = {http://www.bx.psu.edu/~anton/labSite/}
        }</citation>
</citations>
|
|
255 </tool>
|