comparison datasets_gene.xml @ 14:a222b4d3d52e draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit d3fa7b70aa028f527a1dbbb210c172c637dfd4d9
author iuc
date Fri, 09 Dec 2022 15:11:04 +0000
parents
children dfad868c911b
comparison
equal deleted inserted replaced
13:d979ba07ddd4 14:a222b4d3d52e
1 <tool id="datasets_download_gene" name="NCBI Datasets Gene" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
2 <description>download gene sequences and metadata</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="bio_tools"/>
7 <expand macro="requirements"></expand>
8 <expand macro="version_command"/>
9 <command><![CDATA[
10 #import re
11 @SETUP_CERTIFICATES@
12 datasets download gene $query.subcommand.download_by
13 #if $query.subcommand.download_by == 'taxon':
14 '$query.subcommand.taxon_positional'
15 #else:
16 #if $query.subcommand.text_or_file.text_or_file == 'text':
17 #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
18 #else
19 --inputfile '$query.subcommand.text_or_file.inputfile'
20 #end if
21 #end if
22
23 #if $query.subcommand.download_by != 'taxon' and $query.subcommand.ortholog:
24 --ortholog '$query.subcommand.ortholog'
25 #end if
26
27 #if $query.subcommand.download_by == 'symbol':
28 #if $query.subcommand.taxon
29 --taxon '$query.subcommand.taxon'
30 #end if
31 #end if
32
33 #if $query.subcommand.download_by == 'accession':
34 #if $query.subcommand.taxon_filter
35 --taxon-filter '$query.subcommand.taxon_filter'
36 #end if
37 #if str($query.subcommand.include_flanks_bp)
38 --include-flanks-bp $query.subcommand.include_flanks_bp
39 #end if
40 #end if
41 ## fasta filter: use str() (as for the accession list above) so that the sanitized value is what gets split and single-quoted
42 #if $filters.fasta_filter_cond.fasta_filter_select
43 #if $filters.fasta_filter_cond.fasta_filter_select == 'text'
44 --fasta-filter #echo ",".join(f"'{x}'" for x in str($filters.fasta_filter_cond.fasta_filter).split(',') if x)
45 #else
46 --fasta-filter-file '$filters.fasta_filter_cond.fasta_filter_file'
47 #end if
48 #end if
49
50 --include
51 #if $file_choices.kingdom_cond.include
52 #echo ",".join($file_choices.kingdom_cond.include)
53 #else
54 none
55 #end if
56
57 --no-progressbar
58
59 ## produce TSV report file (either gene or prok-gene)
60 &&
61 dataformat
62 tsv
63 $file_choices.kingdom_cond.kingdom_sel
64 --package ncbi_dataset.zip
65 --fields #echo ",".join($file_choices.kingdom_cond.report_columns)
66 > gene_data_report.tsv
67 ## if ! dataformat tsv gene --package ncbi_dataset.zip > gene_data_report.tsv 2> dataformat.log; then
68 ## dataformat tsv prok-gene --package ncbi_dataset.zip > gene_data_report.tsv 2>> dataformat.log;
69 ## fi
70
71 #if $file_choices.kingdom_cond.include and "product-report" in $file_choices.kingdom_cond.include
72 && dataformat tsv gene-product --package ncbi_dataset.zip > gene_product_report.tsv
73 #end if
74
75 ## unzip and rehydrate if any data is to be downloaded (include is not None)
76 #if $file_choices.kingdom_cond.include
77 ## unzip
78 && 7z x -y ncbi_dataset.zip > 7z.log
79 #end if
80 ]]></command>
81 <inputs>
82 <section name="query" title="Query" expanded="true">
83 <conditional name="subcommand">
84 <param name="download_by" type="select" label="Choose how to find genes to download">
85 <option value="gene-id">By NCBI Gene ID</option>
86 <option value="symbol">By Gene symbol</option>
87 <option value="accession">By RefSeq nucleotide or protein accession</option>
88 <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
89 </param>
90 <when value="gene-id">
91 <expand macro="text_or_file" what="Gene ID" what_extended="NCBI Gene ID" help=""/>
92 <expand macro="ortholog"/>
93 </when>
94 <when value="symbol">
95 <expand macro="text_or_file" what="Gene Symbol" what_extended="NCBI Gene Symbol" help=""/>
96 <expand macro="ortholog"/>
97 <param argument="--taxon" type="text" value="human" label="Species for gene symbol" help="NCBI taxid, common or scientific name">
98 <sanitizer invalid_char=""> <!-- digits are valid so that numeric NCBI taxids are kept -->
99 <valid initial="string.letters,string.digits">
100 <add value=" " />
101 <add value="-" />
102 </valid>
103 </sanitizer>
104 </param>
105 </when>
106 <when value="accession">
107 <expand macro="text_or_file" what="Gene Accession" what_extended="NCBI Gene Accession" help=""/>
108 <expand macro="ortholog"/>
109 <param argument="--taxon-filter" type="text" value="" label="Limit gene sequences and annotation report file to specified taxon" help="any rank, only available for WP accessions">
110 <sanitizer invalid_char=""> <!-- same character set as the taxon parameter of the symbol query -->
111 <valid initial="string.letters,string.digits">
112 <add value=" " />
113 <add value="-" />
114 </valid>
115 </sanitizer>
116 </param>
117 <param argument="--include-flanks-bp" type="integer" optional="true" min="0" label="Length of flanking nucleotides" help="WP accessions only"/>
118 </when>
119 <when value="taxon">
120 <expand macro="taxon_positional"/>
121 </when>
122 </conditional>
123 </section>
124 <section name="filters" title="Filters and Limit">
125 <conditional name="fasta_filter_cond" label="Filter protein and RNA sequences by RefSeq nucleotide and protein accessions">
126 <param name="fasta_filter_select" type="select" label="Apply filter">
127 <option value="">No</option>
128 <option value="text">Enter accessions</option>
129 <option value="file">Read a list of accessions from a dataset</option>
130 </param>
131 <when value=""/>
132 <when value="text">
133 <param argument="--fasta-filter" type="text" label="RefSeq nucleotide and protein accessions" help="Comma separated">
134 <sanitizer invalid_char=""> <!-- accessions such as NP_542432.2 contain "_" and "." -->
135 <valid initial="string.letters,string.digits">
136 <add value="," /><add value="_" /><add value="." />
137 </valid>
138 </sanitizer>
139 </param>
140 </when>
141 <when value="file">
142 <param argument="--fasta-filter-file" type="data" format="txt" label="Dataset with list of RefSeq nucleotide and protein accessions" help=""/>
143 </when>
144 </conditional>
145 </section>
146 <section name="file_choices" title="Output options" expanded="true">
147 <conditional name="kingdom_cond">
148 <param name="kingdom_sel" type="select" label="Kingdom" help="Prokaryotic: Accessions starting with WP_. Data report has a different format and the rna, cds, 3/5' UTR and gene-product report are not supported. ">
149 <option value="gene">Eukaryote</option>
150 <option value="prok-gene">Prokaryote</option>
151 </param>
152 <when value="gene">
153 <expand macro="gene_tsv_report_columns">
154 <option value="gene-id" selected="true">NCBI GeneID</option>
155 <option value="gene-type" selected="true">Gene Type</option>
156 <option value="common-name" selected="true">Common Name</option>
157 <option value="description" selected="true">Description</option>
158 <option value="symbol" selected="true">Symbol</option>
159 <option value="synonyms" selected="true">Synonyms</option>
160 <option value="tax-id" selected="true">Taxonomic ID</option>
161 <option value="tax-name" selected="true">Taxonomic Name</option>
162 </expand>
163 <expand macro="include">
164 <expand macro="gene_includes">
165 <option value="rna" selected="true">transcript (rna)</option>
166 <option value="cds">nucleotide coding sequences (cds)</option>
167 <option value="5p-utr">5'-UTR (5p-utr)</option>
168 <option value="3p-utr">3'-UTR (3p-utr)</option>
169 <option value="product-report">gene product report (product-report)</option>
170 </expand>
171 </expand>
172 </when>
173 <when value="prok-gene">
174 <expand macro="prok_gene_tsv_report_columns">
175 <option value="accession" selected="true">Accession</option>
176 <option value="description" selected="true">Description</option>
177 <option value="ec-number" selected="true">EC Number</option>
178 <option value="gene-symbol" selected="true">Gene Symbol</option>
179 <option value="mapping-count" selected="true">Number of Genome Mappings</option>
180 <option value="protein-length" selected="true">Protein Length</option>
181 <option value="protein-name" selected="true">Protein Name</option>
182 </expand>
183 <expand macro="include">
184 <expand macro="gene_includes"/>
185 </expand>
186 </when>
187 </conditional>
188 <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/> <!-- NOTE(review): decompress is not referenced in the command section of this file; confirm whether it is still needed -->
189 </section>
190 </inputs>
191 <outputs>
192 <data name="gene_data_report" format="tabular" label="NCBI Gene Datasets: Data Report" from_work_dir="gene_data_report.tsv"/>
193 <data name="gene_product_report" format="tabular" label="NCBI Gene Datasets: Product Report" from_work_dir="gene_product_report.tsv">
194 <filter>file_choices['kingdom_cond']['include'] and "product-report" in file_choices['kingdom_cond']['include']</filter>
195 </data>
196 <data name="gene_fasta" label="NCBI Gene Datasets: Gene fasta" format="fasta" from_work_dir="ncbi_dataset/data/gene.fna">
197 <filter>file_choices['kingdom_cond']['include'] and "gene" in file_choices['kingdom_cond']['include']</filter>
198 </data>
199 <data name="rna_fasta" label="NCBI Gene Datasets: RNA fasta" format="fasta" from_work_dir="ncbi_dataset/data/rna.fna">
200 <filter>file_choices['kingdom_cond']['include'] and "rna" in file_choices['kingdom_cond']['include']</filter>
201 </data>
202 <data name="protein_fasta" label="NCBI Gene Datasets: protein fasta" format="fasta" from_work_dir="ncbi_dataset/data/protein.faa">
203 <filter>file_choices['kingdom_cond']['include'] and "protein" in file_choices['kingdom_cond']['include']</filter>
204 </data>
205 <data name="cds_fasta" label="NCBI Gene Datasets: CDS fasta" format="fasta" from_work_dir="ncbi_dataset/data/cds.fna">
206 <filter>file_choices['kingdom_cond']['include'] and "cds" in file_choices['kingdom_cond']['include']</filter>
207 </data>
208 <data name="threep_utr_fasta" label="NCBI Gene Datasets: 3' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/3p_utr.fna"> <!-- collected only when 3p-utr is among the selected includes -->
209 <filter>file_choices['kingdom_cond']['include'] and "3p-utr" in file_choices['kingdom_cond']['include']</filter>
210 </data>
211 <data name="fivep_utr_fasta" label="NCBI Gene Datasets: 5' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/5p_utr.fna">
212 <filter>file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']</filter>
213 </data>
214 </outputs>
215 <tests>
216 <!-- 1: datasets download gene gene-id 672 -->
217 <test expect_num_outputs="3">
218 <conditional name="query|subcommand">
219 <param name="download_by" value="gene-id"/>
220 <conditional name="text_or_file">
221 <param name="text_or_file" value="text"/>
222 <param name="accession" value="672"/>
223 </conditional>
224 </conditional>
225 <output name="gene_data_report">
226 <assert_contents>
227 <has_text text="human"/>
228 <has_text text="BRCA1"/>
229 <has_n_lines n="2"/>
230 <has_n_columns n="8"/>
231 </assert_contents>
232 </output>
233 <output name="rna_fasta">
234 <assert_contents>
235 <has_text text=">"/>
236 </assert_contents>
237 </output>
238 <output name="protein_fasta">
239 <assert_contents>
240 <has_text text=">"/>
241 </assert_contents>
242 </output>
243 </test>
244 <!-- 2: datasets download gene gene-id 2597 14433 -->
245 <test expect_num_outputs="3">
246 <conditional name="query|subcommand">
247 <param name="download_by" value="gene-id"/>
248 <conditional name="text_or_file">
249 <param name="text_or_file" value="text"/>
250 <param name="accession" value="2597,14433"/>
251 </conditional>
252 </conditional>
253 <output name="gene_data_report">
254 <assert_contents>
255 <has_text text="house mouse"/>
256 <has_text text="glyceraldehyde-3-phosphate dehydrogenase"/>
257 <has_n_lines n="3"/>
258 <has_n_columns n="8"/>
259 </assert_contents>
260 </output>
261 <output name="rna_fasta">
262 <assert_contents>
263 <has_text text=">"/>
264 </assert_contents>
265 </output>
266 <output name="protein_fasta">
267 <assert_contents>
268 <has_text text=">"/>
269 </assert_contents>
270 </output>
271 </test>
272 <!-- 3: same as above + give accessions by file, 2 different outputs and ortholog-->
273 <test expect_num_outputs="3">
274 <conditional name="query|subcommand">
275 <param name="download_by" value="gene-id"/>
276 <conditional name="text_or_file">
277 <param name="text_or_file" value="file"/>
278 <param name="inputfile" value="geneids.txt"/>
279 </conditional>
280 <param name="ortholog" value="Haplorrhini,Strepsirrhini"/>
281 </conditional>
282 <section name="file_choices">
283 <conditional name="kingdom_cond">
284 <param name="include" value="gene,cds"/>
285 </conditional>
286 </section>
287 <output name="gene_data_report">
288 <assert_contents>
289 <has_text text="baboon"/>
290 <has_text text="glyceraldehyde-3-phosphate dehydrogenase"/>
291 <has_n_lines min="30"/>
292 <has_n_columns n="8"/>
293 </assert_contents>
294 </output>
295 <output name="gene_fasta">
296 <assert_contents>
297 <has_text text=">"/>
298 </assert_contents>
299 </output>
300 <output name="cds_fasta">
301 <assert_contents>
302 <has_text text=">"/>
303 </assert_contents>
304 </output>
305 </test>
306 <!-- 4: datasets download gene symbol tp53 -->
307 <test expect_num_outputs="1">
308 <conditional name="query|subcommand">
309 <param name="download_by" value="symbol"/>
310 <conditional name="text_or_file">
311 <param name="text_or_file" value="text"/>
312 <param name="accession" value="tp53"/>
313 </conditional>
314 </conditional>
315 <section name="file_choices">
316 <conditional name="kingdom_cond">
317 <param name="include" value=""/>
318 </conditional>
319 </section>
320 <output name="gene_data_report">
321 <assert_contents>
322 <has_text text="human"/>
323 <has_n_lines n="2"/>
324 <has_n_columns n="8"/>
325 </assert_contents>
326 </output>
327 </test>
328 <!-- 5: datasets download gene symbol brca1 \-\-taxon mouse -->
329 <test expect_num_outputs="4">
330 <conditional name="query|subcommand">
331 <param name="download_by" value="symbol"/>
332 <conditional name="text_or_file">
333 <param name="text_or_file" value="text"/>
334 <param name="accession" value="brca1"/>
335 </conditional>
336 <param name="taxon" value="mouse"/>
337 </conditional>
338 <section name="file_choices">
339 <conditional name="kingdom_cond">
340 <param name="include" value="3p-utr,5p-utr,product-report"/>
341 </conditional>
342 </section>
343 <output name="gene_data_report">
344 <assert_contents>
345 <has_text text="house mouse"/>
346 <has_text text="Brca1"/>
347 <has_n_lines n="2"/>
348 <has_n_columns n="8"/>
349 </assert_contents>
350 </output>
351 <output name="gene_product_report">
352 <assert_contents>
353 <has_text text="house mouse"/>
354 <has_text text="XR_004936704.1"/>
355 <has_n_lines min="130"/>
356 <has_n_columns n="38"/>
357 </assert_contents>
358 </output>
359 <output name="threep_utr_fasta">
360 <assert_contents>
361 <has_text text=">"/>
362 </assert_contents>
363 </output>
364 <output name="fivep_utr_fasta">
365 <assert_contents>
366 <has_text text=">"/>
367 </assert_contents>
368 </output>
369 </test>
370 <!-- 6: datasets download gene symbol brca1 \-\-ortholog -->
371 <test expect_num_outputs="1">
372 <conditional name="query|subcommand">
373 <param name="download_by" value="symbol"/>
374 <conditional name="text_or_file">
375 <param name="text_or_file" value="text"/>
376 <param name="accession" value="brca1"/>
377 </conditional>
378 <param name="ortholog" value="rodentia"/>
379 </conditional>
380 <section name="file_choices">
381 <conditional name="kingdom_cond">
382 <param name="include" value=""/>
383 </conditional>
384 </section>
385 <output name="gene_data_report">
386 <assert_contents>
387 <has_text text="rat"/>
388 <has_text text="Brca1"/>
389 <has_n_lines min="30"/>
390 <has_n_columns n="8"/>
391 </assert_contents>
392 </output>
393 </test>
394
395 <!-- 7: datasets download gene accession NP_000483.3 -->
396 <test expect_num_outputs="1">
397 <conditional name="query|subcommand">
398 <param name="download_by" value="accession"/>
399 <conditional name="text_or_file">
400 <param name="text_or_file" value="text"/>
401 <param name="accession" value="NP_000483.3"/>
402 </conditional>
403 </conditional>
404 <section name="file_choices">
405 <conditional name="kingdom_cond">
406 <param name="include" value=""/>
407 </conditional>
408 </section>
409 <output name="gene_data_report">
410 <assert_contents>
411 <has_text text="human"/>
412 <has_n_lines n="2"/>
413 <has_n_columns n="8"/>
414 </assert_contents>
415 </output>
416 </test>
417 <!-- 8: datasets download gene accession NM_000546.6 NM_000492.4 + ortholog all; NOTE(review): assumes the ortholog macro is a free-text taxon list where "all" selects the complete ortholog set (confirm against macros.xml) -->
418 <test expect_num_outputs="1">
419 <conditional name="query|subcommand">
420 <param name="download_by" value="accession"/>
421 <conditional name="text_or_file">
422 <param name="text_or_file" value="text"/>
423 <param name="accession" value="NM_000546.6 NM_000492.4"/>
424 </conditional>
425 <param name="ortholog" value="all"/>
426 </conditional>
427 <section name="file_choices">
428 <conditional name="kingdom_cond">
429 <param name="include" value=""/>
430 </conditional>
431 </section>
432 <output name="gene_data_report">
433 <assert_contents>
434 <has_text text="human"/>
435 <has_n_lines min="800"/>
436 <has_n_columns n="8"/>
437 </assert_contents>
438 </output>
439 </test>
440
441 <!-- 9: datasets download gene accession WP_004675351.1 + include_flanks_bp -->
442 <test expect_num_outputs="3">
443 <conditional name="query|subcommand">
444 <param name="download_by" value="accession"/>
445 <conditional name="text_or_file">
446 <param name="text_or_file" value="text"/>
447 <param name="accession" value="WP_004675351.1"/>
448 </conditional>
449 <param name="include_flanks_bp" value="100"/>
450 </conditional>
451 <section name="file_choices">
452 <conditional name="kingdom_cond">
453 <param name="kingdom_sel" value="prok-gene"/>
454 <param name="include" value="gene,protein"/>
455 </conditional>
456 </section>
457 <output name="gene_data_report">
458 <assert_contents>
459 <has_text text="glcE"/>
460 <has_n_lines n="2"/>
461 <has_n_columns n="7"/>
462 </assert_contents>
463 </output>
464 <output name="gene_fasta">
465 <assert_contents>
466 <has_text text=">"/>
467 </assert_contents>
468 </output>
469 <output name="protein_fasta">
470 <assert_contents>
471 <has_text text=">"/>
472 </assert_contents>
473 </output>
474 <assert_command>
475 <has_text text="include-flanks-bp 100"/>
476 </assert_command>
477 </test>
478
479 <!-- 10: datasets download gene taxon human -->
480 <test expect_num_outputs="1">
481 <conditional name="query|subcommand">
482 <param name="download_by" value="taxon"/>
483 <param name="taxon_positional" value="human"/>
484 </conditional>
485 <section name="file_choices">
486 <conditional name="kingdom_cond">
487 <param name="include" value=""/>
488 </conditional>
489 </section>
490 <output name="gene_data_report">
491 <assert_contents>
492 <has_text text="human"/>
493 <has_n_lines min="72000"/>
494 <has_n_columns n="8"/>
495 </assert_contents>
496 </output>
497 </test>
498 <!-- 11: datasets download gene taxon human + \-\-fasta-filter -->
499 <test expect_num_outputs="2">
500 <conditional name="query|subcommand">
501 <param name="download_by" value="taxon"/>
502 <param name="taxon_positional" value="human"/>
503 </conditional>
504 <section name="file_choices">
505 <conditional name="kingdom_cond">
506 <param name="include" value="protein"/>
507 </conditional>
508 </section>
509 <section name="filters">
510 <conditional name="fasta_filter_cond">
511 <param name="fasta_filter_select" value="text"/>
512 <param name="fasta_filter" value="NP_542432.2"/>
513 </conditional>
514 </section>
515 <output name="gene_data_report">
516 <assert_contents>
517 <has_text text="human"/>
518 <has_n_lines min="72000"/>
519 <has_n_columns n="8"/>
520 </assert_contents>
521 </output>
522 <output name="protein_fasta">
523 <assert_contents>
524 <has_text text=">" n="1" />
525 </assert_contents>
526 </output></test>
527 </tests>
528 <help>
529 <![CDATA[
530 **Download Gene Datasets from NCBI**
531
532 Download a gene dataset (gene sequence, transcript, amino acid sequences,
533 nucleotide coding sequences, 5'-UTR, 3'-UTR) as well as gene and gene
534 product reports. Genes can be referred by gene id, symbol, accession,
535 or taxon.
536 ]]>
537 </help>
538 <expand macro="citations"/>
539 </tool>