comparison datasets_genome.xml @ 20:35d32c807c23 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5a65a62588a36d757f96681bf72f537c12c91beb
author iuc
date Fri, 26 Dec 2025 17:16:51 +0000
parents 9a10a6449901
children
comparison
equal deleted inserted replaced
19:ced734560c9d 20:35d32c807c23
2 <description>download genome sequence, annotation and metadata</description> 2 <description>download genome sequence, annotation and metadata</description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="bio_tools"/> 6 <expand macro="bio_tools"/>
7 <expand macro="requirements"></expand> 7 <expand macro="requirements"/>
8 <expand macro="version_command"/> 8 <expand macro="version_command"/>
9 <command><![CDATA[ 9 <stdio>
10 <regex match="Warning" source="stderr" level="warning" description=""/>
11 <regex match="skipping" source="stderr" level="warning" description=""/>
12 <regex match="ERROR" level="fatal"/>
13 </stdio>
14 <command detect_errors="exit_code"><![CDATA[
10 #import re 15 #import re
11 @SETUP_CERTIFICATES@ 16 @SETUP_CERTIFICATES@
12 datasets download genome $query.subcommand.download_by 17 datasets download genome $query.subcommand.download_by
13 #if $query.subcommand.download_by == 'accession': 18 #if $query.subcommand.download_by == 'accession':
14 #if $query.subcommand.text_or_file.text_or_file == 'text': 19 #if $query.subcommand.text_or_file.text_or_file == 'text':
39 44
40 @INCLUDE@ 45 @INCLUDE@
41 @RELEASED_BEFORE@ 46 @RELEASED_BEFORE@
42 @RELEASED_AFTER@ 47 @RELEASED_AFTER@
43 #for search_term in $filters.search: 48 #for search_term in $filters.search:
44 --search '$filters.search_term' 49 --search '$search_term.search'
45 #end for 50 #end for
46 --no-progressbar 51 --no-progressbar
47 --dehydrated 52 --dehydrated
48 53
49 ## produce TSV report file 54 ## produce TSV report file
114 <option value="only" selected="false">Limit to MAGs</option> 119 <option value="only" selected="false">Limit to MAGs</option>
115 <option value="exclude" selected="false">Exclude MAGs</option> 120 <option value="exclude" selected="false">Exclude MAGs</option>
116 </param> 121 </param>
117 <expand macro="released_options"/> 122 <expand macro="released_options"/>
118 <expand macro="released_options" before_or_after="after"/> 123 <expand macro="released_options" before_or_after="after"/>
119
120 <repeat name="search" title="Add search terms"> 124 <repeat name="search" title="Add search terms">
121 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> 125 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
122 </repeat> 126 </repeat>
123 </section> 127 </section>
124 <section name="file_choices" title="Output options" expanded="true"> 128 <section name="file_choices" title="Output options" expanded="true">
135 </section> 139 </section>
136 </inputs> 140 </inputs>
137 <outputs> 141 <outputs>
138 <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/> 142 <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
139 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> 143 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
140 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 144 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
141 <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter> 145 <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
142 </collection> 146 </collection>
143 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> 147 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
144 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 148 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
145 <filter>file_choices['include'] and "genome" in file_choices['include']</filter> 149 <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
146 </collection> 150 </collection>
147 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> 151 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
148 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 152 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
149 <filter>file_choices['include'] and "rna" in file_choices['include']</filter> 153 <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
150 </collection> 154 </collection>
151 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> 155 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
152 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 156 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
153 <filter>file_choices['include'] and "protein" in file_choices['include']</filter> 157 <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
154 </collection> 158 </collection>
155 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> 159 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
156 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 160 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
157 <filter>file_choices['include'] and "cds" in file_choices['include']</filter> 161 <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
158 </collection> 162 </collection>
159 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> 163 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
160 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 164 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
161 <filter>file_choices['include'] and "gff3" in file_choices['include']</filter> 165 <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
162 </collection> 166 </collection>
163 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> 167 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
164 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 168 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
165 <filter>file_choices['include'] and "gtf" in file_choices['include']</filter> 169 <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
166 </collection> 170 </collection>
167 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> 171 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
168 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 172 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
169 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> 173 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
170 </collection> 174 </collection>
171 </outputs> 175 </outputs>
172 <tests> 176 <tests>
173 <test expect_num_outputs="3"> 177 <test expect_num_outputs="3">
174 <conditional name="query|subcommand"> 178 <conditional name="query|subcommand">
175 <param name="download_by" value="taxon"/> 179 <param name="download_by" value="taxon"/>
176 <param name="taxon_positional" value="human"/> 180 <param name="taxon_positional" value="human"/>
177 </conditional> 181 </conditional>
178 <param name="chromosomes" value="21"/> 182 <section name="filters">
179 <param name="released_before" value="01/01/2018"/> 183 <param name="chromosomes" value="21"/>
184 <param name="released_before" value="01/01/2018"/>
185 </section>
180 <section name="file_choices"> 186 <section name="file_choices">
181 <!-- include a sequence (which should be downloaded as fasta.gz) 187 <!-- include a sequence (which should be downloaded as fasta.gz)
182 and one non-sequence (which should be decompressed) output --> 188 and one non-sequence (which should be decompressed) output -->
183 <param name="include" value="rna,gff3"/> 189 <param name="include" value="rna,gff3"/>
184 </section> 190 </section>
185 <output name="genome_data_report"> 191 <output name="genome_data_report">
186 <assert_contents> 192 <assert_contents>
187 <has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/> 193 <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/>
188 <has_n_lines n="142"/> 194 <has_n_lines min="140"/>
189 <has_n_columns n="4"/> 195 <has_n_columns n="4"/>
190 </assert_contents> 196 </assert_contents>
191 </output> 197 </output>
192 <output_collection name="rna_fasta" type="list" count="1"> 198 <output_collection name="rna_fasta" type="list">
193 <element name="GCF_000306695.2" decompress="true"> 199 <element name="GCF_000306695.2" decompress="true">
194 <assert_contents> 200 <assert_contents>
195 <has_text text=">"/> 201 <has_text text="&gt;"/>
196 </assert_contents> 202 </assert_contents>
197 </element> 203 </element>
198 </output_collection> 204 </output_collection>
199 <output_collection name="genomic_gff" type="list"> 205 <output_collection name="genomic_gff" type="list">
200 <element name="GCF_000306695.2"> 206 <element name="GCF_000306695.2">
210 </assert_command> 216 </assert_command>
211 </test> 217 </test>
212 <test expect_num_outputs="2"> 218 <test expect_num_outputs="2">
213 <conditional name="query|subcommand"> 219 <conditional name="query|subcommand">
214 <param name="download_by" value="taxon"/> 220 <param name="download_by" value="taxon"/>
215 <param name="taxon_positional" value="human"/> 221 <param name="taxon_positional" value="Norway rat"/>
216 </conditional> 222 </conditional>
217 <param name="chromosomes" value="21"/> 223 <section name="filters">
218 <param name="assembly_level" value="chromosome,complete"/> 224 <param name="chromosomes" value="MT"/>
219 <param name="released_before" value="01/01/2018"/> 225 </section>
220 <section name="file_choices"> 226 <section name="file_choices">
221 <param name="include" value="genome"/> 227 <param name="include" value="genome"/>
222 <param name="decompress" value="true"/> 228 <param name="decompress" value="true"/>
223 </section> 229 </section>
224 <output_collection name="genome_fasta" type="list:list" count="12"> 230 <output_collection name="genome_fasta" type="list:list" count="9">
225 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> 231 <expand macro="genome_fasta_assert" el1="GCA_000001895.4" el2="chrMT" expression="&gt;"/>
226 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> 232 <expand macro="genome_fasta_assert" el1="GCA_015227675.2" el2="chrMT" expression="&gt;"/>
227 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> 233 <expand macro="genome_fasta_assert" el1="GCA_036323735.1" el2="chrMT" expression="&gt;"/>
228 <expand macro="genome_fasta_assert" el1="GCA_000252825.1" el2="chr21" expression=">"/> 234 <expand macro="genome_fasta_assert" el1="GCA_041222355.1" el2="chrMT" expression="&gt;"/>
229 <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression=">"/> 235 <expand macro="genome_fasta_assert" el1="GCA_045687965.1" el2="chrMT" expression="&gt;"/>
230 <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression=">"/> 236 <expand macro="genome_fasta_assert" el1="GCA_045687995.1" el2="chrMT" expression="&gt;"/>
231 <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression=">"/> 237 <expand macro="genome_fasta_assert" el1="GCA_045688005.1" el2="chrMT" expression="&gt;"/>
232 <expand macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression=">"/> 238 <expand macro="genome_fasta_assert" el1="GCF_000001895.5" el2="chrMT" expression="&gt;"/>
233 <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression=">"/> 239 <expand macro="genome_fasta_assert" el1="GCF_015227675.2" el2="chrMT" expression="&gt;"/>
234 <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression=">"/>
235 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
236 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
237 <!-- According to https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 --> 240 <!-- According to https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 -->
238 <!-- 241 <!--
239 <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/> 242 <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/>
240 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> 243 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
241 --> 244 -->
242 </output_collection> 245 </output_collection>
243 <output name="genome_data_report"> 246 <output name="genome_data_report">
244 <assert_contents> 247 <assert_contents>
245 <has_text text="Homo sapiens"/> 248 <has_text text="Rattus norvegicus"/>
246 <has_n_columns n="4"/> 249 <has_n_columns n="4"/>
247 </assert_contents> 250 </assert_contents>
248 </output> 251 </output>
249 </test> 252 </test>
250 <!-- same as previous test but assembly_source=refseq, which removes all of the genomes --> 253 <!-- same as previous test but assembly_source=refseq, which removes all of the genomes -->
251 <test expect_failure="true"> 254 <test expect_failure="true">
252 <conditional name="query|subcommand"> 255 <conditional name="query|subcommand">
253 <param name="download_by" value="taxon"/> 256 <param name="download_by" value="taxon"/>
254 <param name="taxon_positional" value="human"/> 257 <param name="taxon_positional" value="human"/>
255 </conditional> 258 </conditional>
256 <param name="chromosomes" value="21"/> 259 <section name="filters">
257 <param name="assembly_level" value="chromosome,complete"/> 260 <param name="chromosomes" value="21"/>
258 <param name="assembly_source" value="refseq"/> 261 <param name="assembly_level" value="chromosome,complete"/>
259 <param name="released_before" value="01/01/2018"/> 262 <param name="assembly_source" value="refseq"/>
263 <param name="released_before" value="01/01/2018"/>
264 </section>
260 <section name="file_choices"> 265 <section name="file_choices">
261 <param name="include" value="genome"/> 266 <param name="include" value="genome"/>
262 <param name="decompress" value="true"/> 267 <param name="decompress" value="true"/>
263 </section> 268 </section>
264 <assert_stderr> 269 <assert_stderr>
286 <conditional name="text_or_file"> 291 <conditional name="text_or_file">
287 <param name="text_or_file" value="text"/> 292 <param name="text_or_file" value="text"/>
288 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> 293 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
289 </conditional> 294 </conditional>
290 </conditional> 295 </conditional>
291 <param name="released_before" value="01/01/2007"/> 296 <section name="filters">
297 <param name="released_before" value="01/01/2007"/>
298 </section>
292 <section name="file_choices"> 299 <section name="file_choices">
293 <param name="include" value="seq-report,gtf,cds"/> 300 <param name="include" value="seq-report,gtf,cds"/>
294 <param name="decompress" value="true"/> 301 <param name="decompress" value="true"/>
295 </section> 302 </section>
296 <output name="genome_data_report"> 303 <output name="genome_data_report">
298 <has_text text="GCF_000013305.1"/> 305 <has_text text="GCF_000013305.1"/>
299 <has_n_lines n="3"/> 306 <has_n_lines n="3"/>
300 <has_n_columns n="4"/> 307 <has_n_columns n="4"/>
301 </assert_contents> 308 </assert_contents>
302 </output> 309 </output>
303 <output_collection name="sequence_report" type="list" count="2" > 310 <output_collection name="sequence_report" type="list" count="2">
304 <element name="GCF_000007445.1"> 311 <element name="GCF_000007445.1">
305 <assert_contents> 312 <assert_contents>
306 <has_text text="GCF_000007445.1"/> 313 <has_text text="GCF_000007445.1"/>
307 <has_n_lines n="2"/> 314 <has_n_lines n="2"/>
308 <has_n_columns n="15"/> 315 <has_n_columns n="15"/>
314 <has_n_lines n="2"/> 321 <has_n_lines n="2"/>
315 <has_n_columns n="15"/> 322 <has_n_columns n="15"/>
316 </assert_contents> 323 </assert_contents>
317 </element> 324 </element>
318 </output_collection> 325 </output_collection>
319 <output_collection name="genomic_gtf" type="list"> 326 <output_collection name="genomic_gtf" type="list" count="2">
320 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> 327 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
321 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> 328 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
322 </output_collection> 329 </output_collection>
323 <output_collection name="genomic_cds" type="list"> 330 <output_collection name="genomic_cds" type="list">
324 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> 331 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
331 <conditional name="text_or_file"> 338 <conditional name="text_or_file">
332 <param name="text_or_file" value="file"/> 339 <param name="text_or_file" value="file"/>
333 <param name="inputfile" value="accessions.txt"/> 340 <param name="inputfile" value="accessions.txt"/>
334 </conditional> 341 </conditional>
335 </conditional> 342 </conditional>
336 <param name="released_before" value="01/01/2007"/> 343 <section name="filters">
344 <param name="released_before" value="01/01/2007"/>
345 </section>
337 <section name="file_choices"> 346 <section name="file_choices">
338 <param name="include" value="seq-report,gff3,gbff"/> 347 <param name="include" value="seq-report,gff3,gbff"/>
339 <param name="decompress" value="true"/> 348 <param name="decompress" value="true"/>
340 </section> 349 </section>
341 <output name="genome_data_report"> 350 <output name="genome_data_report">
353 <output_collection name="genomic_gbff" type="list"> 362 <output_collection name="genomic_gbff" type="list">
354 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> 363 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
355 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> 364 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
356 </output_collection> 365 </output_collection>
357 </test> 366 </test>
358
359 <!-- should not fail https://github.com/ncbi/datasets/issues/194 --> 367 <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
360 <test expect_num_outputs="2"> 368 <test expect_num_outputs="2">
361 <conditional name="query|subcommand"> 369 <conditional name="query|subcommand">
362 <param name="download_by" value="accession"/> 370 <param name="download_by" value="accession"/>
363 <conditional name="text_or_file"> 371 <conditional name="text_or_file">
364 <param name="text_or_file" value="text"/> 372 <param name="text_or_file" value="text"/>
365 <param name="accession" value="GCF_000001405"/> 373 <param name="accession" value="GCF_000001405"/>
366 </conditional> 374 </conditional>
367 </conditional> 375 </conditional>
368 <param name="released_before" value="01/01/2015"/> 376 <section name="filters">
369 <param name="assembly_version" value="all"/> 377 <param name="released_before" value="01/01/2015"/>
378 <param name="assembly_version" value="all"/>
379 </section>
370 <section name="file_choices"> 380 <section name="file_choices">
371 <param name="include" value="seq-report"/> 381 <param name="include" value="seq-report"/>
372 </section> 382 </section>
373 <output name="genome_data_report"> 383 <output name="genome_data_report">
374 <!-- assert that we get at least the 16 versions available at the time of writing this test --> 384 <!-- assert that we get at least the 16 versions available at the time of writing this test -->
393 <section name="file_choices"> 403 <section name="file_choices">
394 <param name="include" value="genome,protein,rna,cds"/> 404 <param name="include" value="genome,protein,rna,cds"/>
395 <param name="decompress" value="true"/> 405 <param name="decompress" value="true"/>
396 </section> 406 </section>
397 <output_collection name="genome_fasta" type="list:list" count="1"> 407 <output_collection name="genome_fasta" type="list:list" count="1">
398 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> 408 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
399 </output_collection> 409 </output_collection>
400 <output_collection name="protein_fasta" type="list" count="1"> 410 <output_collection name="protein_fasta" type="list" count="1">
401 <element name="GCF_000146045.2" decompress="true"> 411 <element name="GCF_000146045.2" decompress="true">
402 <assert_contents> 412 <assert_contents>
403 <has_text text=">"/> 413 <has_text text="&gt;"/>
404 </assert_contents> 414 </assert_contents>
405 </element> 415 </element>
406 </output_collection> 416 </output_collection>
407 <output_collection name="rna_fasta" type="list" count="1"> 417 <output_collection name="rna_fasta" type="list" count="1">
408 <element name="GCF_000146045.2" decompress="true"> 418 <element name="GCF_000146045.2" decompress="true">
409 <assert_contents> 419 <assert_contents>
410 <has_text text=">"/> 420 <has_text text="&gt;"/>
411 </assert_contents> 421 </assert_contents>
412 </element> 422 </element>
413 </output_collection> 423 </output_collection>
414 </test> 424 </test>
415 <!-- same as the previous test, but use the default value for decompress, 425 <!-- same as the previous test, but use the default value for decompress,
435 </element> 445 </element>
436 </output_collection> 446 </output_collection>
437 <output_collection name="protein_fasta" type="list" count="1"> 447 <output_collection name="protein_fasta" type="list" count="1">
438 <element name="GCF_000146045.2" ftype="fasta.gz"> 448 <element name="GCF_000146045.2" ftype="fasta.gz">
439 <assert_contents> 449 <assert_contents>
440 <has_size value="1845038" delta="2000"/> 450 <has_size value="1847862" delta="2000"/>
441 </assert_contents> 451 </assert_contents>
442 </element> 452 </element>
443 </output_collection> 453 </output_collection>
444 <output_collection name="rna_fasta" type="list" count="1"> 454 <output_collection name="rna_fasta" type="list" count="1">
445 <element name="GCF_000146045.2" ftype="fasta.gz"> 455 <element name="GCF_000146045.2" ftype="fasta.gz">
461 <param name="include" value="seq-report,genome"/> 471 <param name="include" value="seq-report,genome"/>
462 <param name="decompress" value="true"/> 472 <param name="decompress" value="true"/>
463 </section> 473 </section>
464 <output_collection name="sequence_report" type="list" count="2"/> 474 <output_collection name="sequence_report" type="list" count="2"/>
465 <output_collection name="genome_fasta" type="list:list" count="2"> 475 <output_collection name="genome_fasta" type="list:list" count="2">
466 <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/> 476 <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression="&gt;NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/>
467 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/> 477 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/>
468 </output_collection> 478 </output_collection>
469 </test> 479 </test>
470 <!-- tax_exact_match should filter out strains 480 <!-- tax_exact_match should filter out strains
471 https://github.com/ncbi/datasets/issues/187 --> 481 https://github.com/ncbi/datasets/issues/187 -->
472 <test expect_num_outputs="1"> 482 <test expect_num_outputs="2">
473 <conditional name="query|subcommand"> 483 <conditional name="query|subcommand">
474 <param name="download_by" value="taxon"/> 484 <param name="download_by" value="taxon"/>
475 <param name="taxon_positional" value="4932"/> 485 <param name="taxon_positional" value="4932"/>
476 <param name="tax_exact_match" value="true"/> 486 <param name="tax_exact_match" value="true"/>
477 </conditional> 487 </conditional>
478 <param name="include" value=""/> 488 <output name="genome_data_report">
479 <output name="genome_data_report"> 489 <assert_contents>
480 <assert_contents> 490 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
481 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> 491 </assert_contents>
492 </output>
493 </test>
494 <!-- test search filter -->
495 <test expect_num_outputs="1">
496 <conditional name="query|subcommand">
497 <param name="download_by" value="taxon"/>
498 <param name="taxon_positional" value="Streptococcus"/>
499 </conditional>
500 <section name="filters">
501 <repeat name="search">
502 <param name="search" value="pyogenes"/>
503 </repeat>
504 </section>
505 <section name="file_choices">
506 <param name="include" value_json="null"/>
507 </section>
508 <output name="genome_data_report">
509 <assert_contents>
510 <has_text text="pyogenes"/>
482 </assert_contents> 511 </assert_contents>
483 </output> 512 </output>
484 </test> 513 </test>
485 </tests> 514 </tests>
486 <help> 515 <help><![CDATA[
487 <![CDATA[ 516 .. class:: infomark
488 **Download Genome Datasets from NCBI** 517
489 518 **What it does**
490 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. 519
491 Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon. 520 Downloads genome assemblies from NCBI using the `datasets`_ command-line tool.
492 521 Retrieve genome sequences, annotations, and metadata by accession or taxon.
493 The download is a three step process: 522
494 523 **Query Options**
495 1. A "dehydrated" zip file is downloaded which includes the metadata and the download URL) 524
496 2. The metadata is transformed into a tabular (TSV) file 525 - **By Accession**: NCBI Assembly (GCF\_/GCA\_) or BioProject accession
497 3. The data is hydrated (the actual data is downloaded) 526 - **By Taxon**: Taxonomy ID, scientific name, or common name
498 527
499 The 3rd step can be skipped by unselecting all output types in the `Include` parameter. 528 **Filters**
500 Thereby its possible to inspect the metadata prior to the actual data download. Also this 529
501 allows to use the tool for querying data sets (and their accessions) of interest which 530 ==================== ===============================================
502 can then be downloaded in a second call using the accessions. 531 Filter Description
503 ]]> 532 ==================== ===============================================
504 </help> 533 Reference only Limit to reference/representative assemblies
534 Annotated only Include only genomes with annotations
535 Assembly level Chromosome, complete, contig, or scaffold
536 Assembly source RefSeq (GCF\_) or GenBank (GCA\_)
537 Exclude atypical Remove atypical assemblies (e.g., partial)
538 MAG filter Include/exclude metagenome-assembled genomes
539 Date range Filter by release date
540 ==================== ===============================================
541
542 ----
543
544 .. class:: warningmark
545
546 **Note**: The "Reference only" filter returns only RefSeq (GCF\_) assemblies.
547 If a taxon has only GenBank (GCA\_) assemblies, this filter will return no results
548 with a misleading error message. This is an NCBI datasets bug (not a Galaxy bug).
549
550 **Outputs**
551
552 - **Data Report**: Tabular metadata for matching assemblies
553 - **Genome FASTA**: Genomic sequences (nested collection by accession)
554 - **Annotation files**: GFF3, GTF, GenBank flat files
555 - **Protein/RNA/CDS**: Amino acid and nucleotide sequences
556 - **Sequence Report**: Per-sequence metadata (chromosome, length, etc.)
557
558 .. _datasets: https://www.ncbi.nlm.nih.gov/datasets/
559
560 ]]></help>
505 <expand macro="citations"/> 561 <expand macro="citations"/>
506 </tool> 562 </tool>