comparison datasets_genome.xml @ 14:a222b4d3d52e draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit d3fa7b70aa028f527a1dbbb210c172c637dfd4d9
author iuc
date Fri, 09 Dec 2022 15:11:04 +0000
parents d979ba07ddd4
children dfad868c911b
comparison
equal deleted inserted replaced
13:d979ba07ddd4 14:a222b4d3d52e
1 <tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"> 1 <tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
2 <description>download genome sequence, annotation and metadata</description> 2 <description>download genome sequence, annotation and metadata</description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="bio_tools"/>
6 <expand macro="requirements"></expand> 7 <expand macro="requirements"></expand>
8 <expand macro="version_command"/>
7 <command><![CDATA[ 9 <command><![CDATA[
10 #import re
8 @SETUP_CERTIFICATES@ 11 @SETUP_CERTIFICATES@
9 datasets download genome $query.subcommand.download_by 12 datasets download genome $query.subcommand.download_by
10 #if $query.subcommand.download_by == 'accession': 13 #if $query.subcommand.download_by == 'accession':
11 #if $query.subcommand.text_or_file.text_or_file == 'text': 14 #if $query.subcommand.text_or_file.text_or_file == 'text':
12 #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x) 15 #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
13 #else 16 #else
14 --inputfile '$query.subcommand.text_or_file.inputfile' 17 --inputfile '$query.subcommand.text_or_file.inputfile'
15 #end if 18 #end if
16 #else: 19 #else:
17 '$query.subcommand.taxon' 20 '$query.subcommand.taxon_positional'
18 $query.subcommand.tax_exact_match 21 $query.subcommand.tax_exact_match
19 #end if 22 #end if
20 $filters.reference 23 $filters.reference
21 $filters.annotated 24 $filters.annotated
22 #if $filters.assembly_level: 25 #if $filters.assembly_level:
35 @RELEASED_AFTER@ 38 @RELEASED_AFTER@
36 #for search_term in $filters.search: 39 #for search_term in $filters.search:
37 --search '$filters.search_term' 40 --search '$filters.search_term'
38 #end for 41 #end for
39 --no-progressbar 42 --no-progressbar
40 #if $uncompressed 43 --dehydrated
41 && 7z x -y ncbi_dataset.zip 44
42 #else 45 ## produce TSV report file
43 && 7z l ncbi_dataset.zip > ncbi_dataset.txt 46 && dataformat tsv genome
47 --package ncbi_dataset.zip
48 --fields #echo ",".join($file_choices.report_columns)
49 > genome_data_report.tsv
50
51 ## unzip and rehydrate if any data is to be downloaded (include is not None)
52 #if $file_choices.include
53 ## unzip
54 && 7z x -y ncbi_dataset.zip > 7z.log
55
56 ## rehydrate
57 && datasets rehydrate
58 --directory ./
59 #if not $file_choices.decompress
60 --gzip
61 #end if
62 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}
63
64 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
65 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;
66
67 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
68 ## note "not decompress" means that the datasets are provided compressed (datasets rehydrate is called with --gzip)
69 ## in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
70 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
71 #if not $file_choices.decompress
72 && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
73 #end if
74
75 #if "seq-report" in $file_choices.include
76 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
77 #end if
78
79 && true ## because Galaxy removes trailing ; from command
44 #end if 80 #end if
45 ]]></command> 81 ]]></command>
46 <inputs> 82 <inputs>
47 <section name="query" title="Query" expanded="true"> 83 <section name="query" title="Query" expanded="true">
48 <conditional name="subcommand"> 84 <conditional name="subcommand">
49 <param name="download_by" type="select" label="Choose how to find genomes to download"> 85 <param name="download_by" type="select" label="Choose how to find genomes to download">
50 <option value="accession">Download by NCBI assembly or BioProject accession</option> 86 <option value="accession">By NCBI assembly or BioProject accession</option>
51 <option value="taxon">Download by taxon</option> 87 <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
52 </param> 88 </param>
53 <when value="accession"> 89 <when value="accession">
54 <expand macro="text_or_file"/> 90 <expand macro="text_or_file"/>
55 </when> 91 </when>
56 <when value="taxon"> 92 <when value="taxon">
57 <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/> 93 <expand macro="taxon_positional"/>
58 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/> 94 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
59 </when> 95 </when>
60 </conditional> 96 </conditional>
61 </section> 97 </section>
62 <section name="filters" title="Filters and Limit"> 98 <section name="filters" title="Filters and Limit">
65 <expand macro="assembly_level"/> 101 <expand macro="assembly_level"/>
66 <param argument="--assembly-version" type="select" label="Assembly version(s)"> 102 <param argument="--assembly-version" type="select" label="Assembly version(s)">
67 <option value="latest">Latest</option> 103 <option value="latest">Latest</option>
68 <option value="all">All</option> 104 <option value="all">All</option>
69 </param> 105 </param>
70 <!-- TODO add test for assembly source: according to CLI doc args are RefSeq, GenBank, All and not refseq / genbank-->
71 <expand macro="assembly_source"/> 106 <expand macro="assembly_source"/>
72 <expand macro="chromosomes"/> 107 <expand macro="chromosomes"/>
73 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/> 108 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
74 <expand macro="released_options"/> 109 <expand macro="released_options"/>
75 <expand macro="released_options" before_or_after="after"/> 110 <expand macro="released_options" before_or_after="after"/>
76 111
77 <repeat name="search" title="Add search terms"> 112 <repeat name="search" title="Add search terms">
78 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> 113 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
79 </repeat> 114 </repeat>
80 </section> 115 </section>
81 <section name="file_choices" title="File Choices" expanded="true"> 116 <section name="file_choices" title="Output options" expanded="true">
82 <expand macro="include"/> 117 <expand macro="tsv_report_columns">
118 <option value="accession" selected="true">accession</option>
119 <option value="organism-name" selected="true">organism-name</option>
120 <option value="assminfo-submitter" selected="true">assminfo-submitter</option>
121 <option value="assminfo-name" selected="true">assminfo-name</option>
122 </expand>
123 <expand macro="include">
124 <expand macro="genome_includes"/>
125 </expand>
126 <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
83 </section> 127 </section>
84 <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/>
85 </inputs> 128 </inputs>
86 <outputs> 129 <outputs>
87 <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip"> 130 <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
88 <filter>not uncompressed</filter>
89 </data>
90 <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt">
91 <filter>not uncompressed</filter>
92 </data>
93 <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl">
94 <filter>uncompressed</filter>
95 </data>
96 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> 131 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
97 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 132 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
98 <filter>uncompressed and file_choices['include'] and "seq-report" in file_choices['include']</filter> 133 <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
99 </collection> 134 </collection>
100 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> 135 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
101 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 136 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
102 <filter>uncompressed and file_choices['include'] and "genome" in file_choices['include']</filter> 137 <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
103 </collection> 138 </collection>
104 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> 139 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
105 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 140 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
106 <filter>uncompressed and file_choices['include'] and "rna" in file_choices['include']</filter> 141 <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
107 </collection> 142 </collection>
108 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> 143 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
109 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 144 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
110 <filter>uncompressed and file_choices['include'] and "protein" in file_choices['include']</filter> 145 <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
111 </collection> 146 </collection>
112 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> 147 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
113 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 148 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
114 <filter>uncompressed and file_choices['include'] and "cds" in file_choices['include']</filter> 149 <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
115 </collection> 150 </collection>
116 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> 151 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
117 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 152 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
118 <filter>uncompressed and file_choices['include'] and "gff3" in file_choices['include']</filter> 153 <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
119 </collection> 154 </collection>
120 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> 155 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
121 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 156 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
122 <filter>uncompressed and file_choices['include'] and "gtf" in file_choices['include']</filter> 157 <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
123 </collection> 158 </collection>
124 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> 159 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
125 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 160 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
126 <filter>uncompressed and file_choices['include'] and "gbff" in file_choices['include']</filter> 161 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
127 </collection> 162 </collection>
128 </outputs> 163 </outputs>
129 <tests> 164 <tests>
165 <!-- Note: All but one test use the non-default decompress="true"
166
167 this is because (at 11/22) Galaxy can not apply text assertions on the content
168 of compressed files https://github.com/galaxyproject/galaxy/pull/15085
169
170 So with decompress="true" more powerful assertions are possible.
171 A single test checks the default, ie decompress="false".
172 -->
173 <test expect_num_outputs="3">
174 <conditional name="query|subcommand">
175 <param name="download_by" value="taxon"/>
176 <param name="taxon_positional" value="human"/>
177 </conditional>
178 <param name="chromosomes" value="21"/>
179 <param name="released_before" value="01/01/2018"/>
180 <section name="file_choices">
181 <!-- include a sequence (which should be downloaded as fasta.gz)
182 and one non-sequence (which should be decompressed) output -->
183 <param name="include" value="rna,gff3"/>
184 </section>
185 <output name="genome_data_report">
186 <assert_contents>
187 <has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/>
188 <has_n_lines n="144"/>
189 <has_n_columns n="4"/>
190 </assert_contents>
191 </output>
192 <output_collection name="rna_fasta" type="list" count="1">
193 <element name="GCF_000306695.2" decompress="true">
194 <assert_contents>
195 <has_text text=">"/>
196 </assert_contents>
197 </element>
198 </output_collection>
199 <output_collection name="genomic_gff" type="list">
200 <element name="GCF_000306695.2">
201 <assert_contents>
202 <has_n_lines min="1000000"/>
203 <has_line line="##gff-version 3"/>
204 <!-- TODO this will only work when the galaxy python packages for 22.05 have been released
205 <has_n_columns n="9" comment="#"/> -->
206 </assert_contents>
207 </element>
208 </output_collection>
209 <assert_command>
210 <has_text text="gunzip"/>
211 </assert_command>
212 </test>
130 <test expect_num_outputs="2"> 213 <test expect_num_outputs="2">
131 <conditional name="query|subcommand"> 214 <conditional name="query|subcommand">
132 <param name="download_by" value="taxon"/> 215 <param name="download_by" value="taxon"/>
133 <param name="text_or_file" value="text"/> 216 <param name="taxon_positional" value="human"/>
134 <param name="taxon" value="human"/>
135 </conditional> 217 </conditional>
136 <param name="chromosomes" value="21"/> 218 <param name="chromosomes" value="21"/>
137 <param name="include" value=""/>
138 <param name="uncompressed" value="false"/>
139 <param name="released_before" value="01/01/2018"/>
140 <output name="archive_contents">
141 <assert_contents>
142 <has_text text="ncbi_dataset/data/dataset_catalog.json"/>
143 </assert_contents>
144 </output>
145 </test>
146 <test expect_num_outputs="2">
147 <conditional name="query|subcommand">
148 <param name="download_by" value="taxon"/>
149 <param name="text_or_file" value="text"/>
150 <param name="taxon" value="human"/>
151 </conditional>
152 <param name="chromosomes" value="21"/>
153 <param name="include" value="genome"/>
154 <param name="uncompressed" value="true"/>
155 <param name="assembly_level" value="chromosome,complete"/> 219 <param name="assembly_level" value="chromosome,complete"/>
156 <param name="released_before" value="01/01/2018"/> 220 <param name="released_before" value="01/01/2018"/>
221 <section name="file_choices">
222 <param name="include" value="genome"/>
223 <param name="decompress" value="true"/>
224 </section>
157 <output_collection name="genome_fasta" type="list:list" count="14"> 225 <output_collection name="genome_fasta" type="list:list" count="14">
158 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> 226 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
159 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> 227 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
160 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> 228 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
161 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> 229 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/>
172 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> 240 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
173 </output_collection> 241 </output_collection>
174 <output name="genome_data_report"> 242 <output name="genome_data_report">
175 <assert_contents> 243 <assert_contents>
176 <has_text text="Homo sapiens"/> 244 <has_text text="Homo sapiens"/>
177 </assert_contents> 245 <has_n_columns n="4"/>
178 </output> 246 </assert_contents>
179 </test> 247 </output>
180 <!-- same as precious test but assembly_source (refseq which removes some of the genomes) --> 248 </test>
249 <!-- same as previous test but assembly_source (refseq which removes some of the genomes) -->
181 <test expect_num_outputs="2"> 250 <test expect_num_outputs="2">
182 <conditional name="query|subcommand"> 251 <conditional name="query|subcommand">
183 <param name="download_by" value="taxon"/> 252 <param name="download_by" value="taxon"/>
184 <param name="text_or_file" value="text"/> 253 <param name="taxon_positional" value="human"/>
185 <param name="taxon" value="human"/>
186 </conditional> 254 </conditional>
187 <param name="chromosomes" value="21"/> 255 <param name="chromosomes" value="21"/>
188 <param name="include" value="genome"/>
189 <param name="uncompressed" value="true"/>
190 <param name="assembly_level" value="chromosome,complete"/> 256 <param name="assembly_level" value="chromosome,complete"/>
191 <param name="assembly_source" value="refseq"/> 257 <param name="assembly_source" value="refseq"/>
192 <param name="released_before" value="01/01/2018"/> 258 <param name="released_before" value="01/01/2018"/>
259 <section name="file_choices">
260 <param name="include" value="genome"/>
261 <param name="decompress" value="true"/>
262 </section>
193 <output_collection name="genome_fasta" type="list:list" count="2"> 263 <output_collection name="genome_fasta" type="list:list" count="2">
194 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> 264 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
195 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> 265 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
196 </output_collection> 266 </output_collection>
197 <output name="genome_data_report"> 267 <output name="genome_data_report">
198 <assert_contents> 268 <assert_contents>
199 <has_text text="Homo sapiens"/> 269 <has_text text="Homo sapiens"/>
270 <has_n_lines n="5"/>
271 <has_n_columns n="4"/>
200 </assert_contents> 272 </assert_contents>
201 </output> 273 </output>
202 </test> 274 </test>
203 <test expect_num_outputs="4"> 275 <test expect_num_outputs="4">
204 <conditional name="query|subcommand"> 276 <conditional name="query|subcommand">
206 <conditional name="text_or_file"> 278 <conditional name="text_or_file">
207 <param name="text_or_file" value="text"/> 279 <param name="text_or_file" value="text"/>
208 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> 280 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
209 </conditional> 281 </conditional>
210 </conditional> 282 </conditional>
211 <param name="include" value="seq-report,gtf,cds"/>
212 <param name="uncompressed" value="true"/>
213 <param name="released_before" value="01/01/2007"/> 283 <param name="released_before" value="01/01/2007"/>
284 <section name="file_choices">
285 <param name="include" value="seq-report,gtf,cds"/>
286 <param name="decompress" value="true"/>
287 </section>
214 <output name="genome_data_report"> 288 <output name="genome_data_report">
215 <assert_contents> 289 <assert_contents>
216 <has_text text="GCF_000013305.1"/> 290 <has_text text="GCF_000013305.1"/>
217 </assert_contents> 291 <has_n_lines n="3"/>
218 </output> 292 <has_n_columns n="4"/>
293 </assert_contents>
294 </output>
295 <output_collection name="sequence_report" type="list" count="2" >
296 <element name="GCF_000007445.1">
297 <assert_contents>
298 <has_text text="GCF_000007445.1"/>
299 <has_n_lines n="2"/>
300 <has_n_columns n="14"/>
301 </assert_contents>
302 </element>
303 <element name="GCF_000013305.1">
304 <assert_contents>
305 <has_text text="GCF_000013305.1"/>
306 <has_n_lines n="2"/>
307 <has_n_columns n="14"/>
308 </assert_contents>
309 </element>
310 </output_collection>
219 <output_collection name="genomic_gtf" type="list"> 311 <output_collection name="genomic_gtf" type="list">
220 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> 312 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
221 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> 313 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
222 </output_collection> 314 </output_collection>
223 <output_collection name="genomic_cds" type="list"> 315 <output_collection name="genomic_cds" type="list">
224 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> 316 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains" decompress="true"/>
225 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/> 317 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains" decompress="true"/>
226 </output_collection> 318 </output_collection>
227 </test> 319 </test>
228 <test expect_num_outputs="4"> 320 <test expect_num_outputs="4">
229 <conditional name="query|subcommand"> 321 <conditional name="query|subcommand">
230 <param name="download_by" value="accession"/> 322 <param name="download_by" value="accession"/>
231 <conditional name="text_or_file"> 323 <conditional name="text_or_file">
232 <param name="text_or_file" value="file"/> 324 <param name="text_or_file" value="file"/>
233 <param name="inputfile" value="accessions.txt"/> 325 <param name="inputfile" value="accessions.txt"/>
234 </conditional> 326 </conditional>
235 </conditional> 327 </conditional>
236 <param name="include" value="seq-report,gbff,gff3"/>
237 <param name="uncompressed" value="true"/>
238 <param name="released_before" value="01/01/2007"/> 328 <param name="released_before" value="01/01/2007"/>
239 <output name="genome_data_report"> 329 <section name="file_choices">
240 <assert_contents> 330 <param name="include" value="seq-report,gff3,gbff"/>
241 <has_text text="SAMN02604181"/> 331 <param name="decompress" value="true"/>
332 </section>
333 <output name="genome_data_report">
334 <assert_contents>
335 <has_text text="GCF_000013305.1"/>
336 <has_text text="GCF_000007445.1"/>
337 <has_n_lines n="3"/>
338 <has_n_columns n="4"/>
242 </assert_contents> 339 </assert_contents>
243 </output> 340 </output>
244 <output_collection name="genomic_gff" type="list"> 341 <output_collection name="genomic_gff" type="list">
245 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> 342 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
246 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> 343 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
248 <output_collection name="genomic_gbff" type="list"> 345 <output_collection name="genomic_gbff" type="list">
249 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> 346 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
250 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> 347 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
251 </output_collection> 348 </output_collection>
252 </test> 349 </test>
253 <test expect_num_outputs="2"> 350
351 <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
352 <test expect_num_outputs="2" expect_failure="true">
254 <conditional name="query|subcommand"> 353 <conditional name="query|subcommand">
255 <param name="download_by" value="accession"/> 354 <param name="download_by" value="accession"/>
256 <conditional name="text_or_file"> 355 <conditional name="text_or_file">
257 <param name="text_or_file" value="text"/> 356 <param name="text_or_file" value="text"/>
258 <param name="accession" value="GCF_000001405"/> 357 <param name="accession" value="GCF_000001405"/>
259 </conditional> 358 </conditional>
260 </conditional> 359 </conditional>
261 <param name="include" value="seq-report"/>
262 <param name="uncompressed" value="true"/>
263 <param name="released_before" value="01/01/2015"/> 360 <param name="released_before" value="01/01/2015"/>
264 <param name="assembly_version" value="all"/> 361 <param name="assembly_version" value="all"/>
265 <output_collection name="sequence_report" count="4"> 362 <section name="file_choices">
266 <element name="GCF_000001405.25"> 363 <param name="include" value="seq-report"/>
267 <assert_contents> 364 </section>
268 <has_text text="assignedMoleculeLocationType"/> 365 <!--
269 </assert_contents> 366 <output_collection name="sequence_report" type="list" count="4" >
270 </element> 367 -->
271 <element name="GCF_000001405.26">
272 <assert_contents>
273 <has_text text="assignedMoleculeLocationType"/>
274 </assert_contents>
275 </element>
276 <element name="GCF_000001405.27">
277 <assert_contents>
278 <has_text text="assignedMoleculeLocationType"/>
279 </assert_contents>
280 </element>
281 <element name="GCF_000001405.28">
282 <assert_contents>
283 <has_text text="assignedMoleculeLocationType"/>
284 </assert_contents>
285 </element>
286 </output_collection>
287 </test> 368 </test>
288 <test expect_num_outputs="5"> 369 <test expect_num_outputs="5">
289 <conditional name="query|subcommand"> 370 <conditional name="query|subcommand">
290 <param name="download_by" value="accession"/> 371 <param name="download_by" value="accession"/>
291 <conditional name="text_or_file"> 372 <conditional name="text_or_file">
292 <param name="text_or_file" value="text"/> 373 <param name="text_or_file" value="text"/>
293 <param name="accession" value="GCF_000146045.2"/> 374 <param name="accession" value="GCF_000146045.2"/>
294 </conditional> 375 </conditional>
295 </conditional> 376 </conditional>
296 <param name="include" value="seq-report,genome,rna,cds"/> 377 <section name="file_choices">
297 <param name="uncompressed" value="true"/> 378 <param name="include" value="genome,protein,rna,cds"/>
379 <param name="decompress" value="true"/>
380 </section>
298 <output_collection name="genome_fasta" type="list:list" count="1"> 381 <output_collection name="genome_fasta" type="list:list" count="1">
299 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> 382 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
300 </output_collection> 383 </output_collection>
384 <output_collection name="protein_fasta" type="list" count="1">
385 <element name="GCF_000146045.2" decompress="true">
386 <assert_contents>
387 <has_text text=">"/>
388 </assert_contents>
389 </element>
390 </output_collection>
391 <output_collection name="rna_fasta" type="list" count="1">
392 <element name="GCF_000146045.2" decompress="true">
393 <assert_contents>
394 <has_text text=">"/>
395 </assert_contents>
396 </element>
397 </output_collection>
398 </test>
399 <!-- same as the previous test, but use the default value for decompress,
400 see comment at the beginning of the tests -->
401 <test expect_num_outputs="5">
402 <conditional name="query|subcommand">
403 <param name="download_by" value="accession"/>
404 <conditional name="text_or_file">
405 <param name="text_or_file" value="text"/>
406 <param name="accession" value="GCF_000146045.2"/>
407 </conditional>
408 </conditional>
409 <section name="file_choices">
410 <param name="include" value="genome,protein,rna,cds"/>
411 </section>
412 <output_collection name="genome_fasta" type="list:list" count="1">
413 <element name="GCF_000146045.2">
414 <element name="GCF_000146045.2_R64" ftype="fasta.gz">
415 <assert_contents>
416 <has_size value="3843460"/>
417 </assert_contents>
418 </element>
419 </element>
420 </output_collection>
421 <output_collection name="protein_fasta" type="list" count="1">
422 <element name="GCF_000146045.2" ftype="fasta.gz">
423 <assert_contents>
424 <has_size value="1844838"/>
425 </assert_contents>
426 </element>
427 </output_collection>
428 <output_collection name="rna_fasta" type="list" count="1">
429 <element name="GCF_000146045.2" ftype="fasta.gz">
430 <assert_contents>
431 <has_size value="2784534"/>
432 </assert_contents>
433 </element>
434 </output_collection>
301 </test> 435 </test>
302 <test expect_num_outputs="3"> 436 <test expect_num_outputs="3">
303 <conditional name="query|subcommand"> 437 <conditional name="query|subcommand">
304 <param name="download_by" value="accession"/> 438 <param name="download_by" value="accession"/>
305 <conditional name="text_or_file"> 439 <conditional name="text_or_file">
306 <param name="text_or_file" value="text"/> 440 <param name="text_or_file" value="text"/>
307 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/> 441 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
308 </conditional> 442 </conditional>
309 </conditional> 443 </conditional>
310 <param name="include" value="seq-report,genome"/> 444 <section name="file_choices">
311 <param name="uncompressed" value="true"/> 445 <param name="include" value="seq-report,genome"/>
446 <param name="decompress" value="true"/>
447 </section>
448 <output_collection name="sequence_report" type="list" count="2"/>
312 <output_collection name="genome_fasta" type="list:list" count="2"> 449 <output_collection name="genome_fasta" type="list:list" count="2">
313 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/> 450 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>
314 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> 451 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
315 </output_collection> 452 </output_collection>
316 </test> 453 </test>
318 https://github.com/ncbi/datasets/issues/187 455 https://github.com/ncbi/datasets/issues/187
319 hence we set expect_test_failure="true"--> 456 hence we set expect_test_failure="true"-->
320 <test expect_num_outputs="1" expect_test_failure="true"> 457 <test expect_num_outputs="1" expect_test_failure="true">
321 <conditional name="query|subcommand"> 458 <conditional name="query|subcommand">
322 <param name="download_by" value="taxon"/> 459 <param name="download_by" value="taxon"/>
323 <param name="text_or_file" value="text"/> 460 <param name="taxon_positional" value="4932"/>
324 <param name="taxon" value="4932"/>
325 <param name="tax_exact_match" value="true"/> 461 <param name="tax_exact_match" value="true"/>
326 </conditional> 462 </conditional>
327 <param name="include" value=""/> 463 <param name="include" value=""/>
328 <param name="uncompressed" value="true"/>
329 <output name="genome_data_report"> 464 <output name="genome_data_report">
330 <assert_contents> 465 <assert_contents>
331 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> 466 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
332 </assert_contents> 467 </assert_contents>
333 </output> 468 </output>
336 <help> 471 <help>
337 <![CDATA[ 472 <![CDATA[
338 **Download Genome Datasets from NCBI** 473 **Download Genome Datasets from NCBI**
339 474
340 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. 475 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
341 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. 476 Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.
342 477
343 Tthe default genome dataset includes the following files (if available): 478 The download is a three-step process:
344 * data_report.jsonl (genome assembly and annotation metadata, not always available) 479
345 * genomic.fna (genomic sequences) 480 1. A "dehydrated" zip file is downloaded (which includes the metadata and the download URL)
346 * rna.fna (transcript sequences) 481 2. The metadata is transformed into a tabular (TSV) file
347 * protein.faa (protein sequences) 482 3. The data is hydrated (the actual data is downloaded)
348 * genomic.gff (genome annotation in gff3 format) 483
349 * dataset_catalog.json (a list of files and file types included in the dataset) 484 The 3rd step can be skipped by unselecting all output types in the `Include` parameter.
485 This makes it possible to inspect the metadata prior to the actual data download. It also
486 allows the tool to be used for querying data sets (and their accessions) of interest, which
487 can then be downloaded in a second call using the accessions.
350 ]]> 488 ]]>
351 </help> 489 </help>
352 490 <expand macro="citations"/>
353 </tool> 491 </tool>