Mercurial > repos > iuc > ncbi_datasets
comparison datasets_genome.xml @ 20:35d32c807c23 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5a65a62588a36d757f96681bf72f537c12c91beb
| author | iuc |
|---|---|
| date | Fri, 26 Dec 2025 17:16:51 +0000 |
| parents | 9a10a6449901 |
| children | |
comparison
equal
deleted
inserted
replaced
| 19:ced734560c9d | 20:35d32c807c23 |
|---|---|
| 2 <description>download genome sequence, annotation and metadata</description> | 2 <description>download genome sequence, annotation and metadata</description> |
| 3 <macros> | 3 <macros> |
| 4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
| 5 </macros> | 5 </macros> |
| 6 <expand macro="bio_tools"/> | 6 <expand macro="bio_tools"/> |
| 7 <expand macro="requirements"></expand> | 7 <expand macro="requirements"/> |
| 8 <expand macro="version_command"/> | 8 <expand macro="version_command"/> |
| 9 <command><![CDATA[ | 9 <stdio> |
| | 10 <regex match="Warning" source="stderr" level="warning" description=""/> |
| | 11 <regex match="skipping" source="stderr" level="warning" description=""/> |
| | 12 <regex match="ERROR" level="fatal"/> |
| | 13 </stdio> |
| | 14 <command detect_errors="exit_code"><![CDATA[ |
| 10 #import re | 15 #import re |
| 11 @SETUP_CERTIFICATES@ | 16 @SETUP_CERTIFICATES@ |
| 12 datasets download genome $query.subcommand.download_by | 17 datasets download genome $query.subcommand.download_by |
| 13 #if $query.subcommand.download_by == 'accession': | 18 #if $query.subcommand.download_by == 'accession': |
| 14 #if $query.subcommand.text_or_file.text_or_file == 'text': | 19 #if $query.subcommand.text_or_file.text_or_file == 'text': |
| 39 | 44 |
| 40 @INCLUDE@ | 45 @INCLUDE@ |
| 41 @RELEASED_BEFORE@ | 46 @RELEASED_BEFORE@ |
| 42 @RELEASED_AFTER@ | 47 @RELEASED_AFTER@ |
| 43 #for search_term in $filters.search: | 48 #for search_term in $filters.search: |
| 44 --search '$filters.search_term' | 49 --search '$search_term.search' |
| 45 #end for | 50 #end for |
| 46 --no-progressbar | 51 --no-progressbar |
| 47 --dehydrated | 52 --dehydrated |
| 48 | 53 |
| 49 ## produce TSV report file | 54 ## produce TSV report file |
| 114 <option value="only" selected="false">Limit to MAGs</option> | 119 <option value="only" selected="false">Limit to MAGs</option> |
| 115 <option value="exclude" selected="false">Exclude MAGs</option> | 120 <option value="exclude" selected="false">Exclude MAGs</option> |
| 116 </param> | 121 </param> |
| 117 <expand macro="released_options"/> | 122 <expand macro="released_options"/> |
| 118 <expand macro="released_options" before_or_after="after"/> | 123 <expand macro="released_options" before_or_after="after"/> |
| 119 | |
| 120 <repeat name="search" title="Add search terms"> | 124 <repeat name="search" title="Add search terms"> |
| 121 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> | 125 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> |
| 122 </repeat> | 126 </repeat> |
| 123 </section> | 127 </section> |
| 124 <section name="file_choices" title="Output options" expanded="true"> | 128 <section name="file_choices" title="Output options" expanded="true"> |
| 135 </section> | 139 </section> |
| 136 </inputs> | 140 </inputs> |
| 137 <outputs> | 141 <outputs> |
| 138 <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/> | 142 <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/> |
| 139 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> | 143 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> |
| 140 <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 144 <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 141 <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter> | 145 <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter> |
| 142 </collection> | 146 </collection> |
| 143 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> | 147 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> |
| 144 <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?\.(?P<ext>fasta(\.gz)?)" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 148 <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?\.(?P<ext>fasta(\.gz)?)" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 145 <filter>file_choices['include'] and "genome" in file_choices['include']</filter> | 149 <filter>file_choices['include'] and "genome" in file_choices['include']</filter> |
| 146 </collection> | 150 </collection> |
| 147 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> | 151 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> |
| 148 <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 152 <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 149 <filter>file_choices['include'] and "rna" in file_choices['include']</filter> | 153 <filter>file_choices['include'] and "rna" in file_choices['include']</filter> |
| 150 </collection> | 154 </collection> |
| 151 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> | 155 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> |
| 152 <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 156 <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 153 <filter>file_choices['include'] and "protein" in file_choices['include']</filter> | 157 <filter>file_choices['include'] and "protein" in file_choices['include']</filter> |
| 154 </collection> | 158 </collection> |
| 155 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> | 159 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> |
| 156 <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 160 <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 157 <filter>file_choices['include'] and "cds" in file_choices['include']</filter> | 161 <filter>file_choices['include'] and "cds" in file_choices['include']</filter> |
| 158 </collection> | 162 </collection> |
| 159 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> | 163 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> |
| 160 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 164 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 161 <filter>file_choices['include'] and "gff3" in file_choices['include']</filter> | 165 <filter>file_choices['include'] and "gff3" in file_choices['include']</filter> |
| 162 </collection> | 166 </collection> |
| 163 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> | 167 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> |
| 164 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 168 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 165 <filter>file_choices['include'] and "gtf" in file_choices['include']</filter> | 169 <filter>file_choices['include'] and "gtf" in file_choices['include']</filter> |
| 166 </collection> | 170 </collection> |
| 167 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> | 171 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> |
| 168 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 172 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 169 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> | 173 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> |
| 170 </collection> | 174 </collection> |
| 171 </outputs> | 175 </outputs> |
| 172 <tests> | 176 <tests> |
| 173 <test expect_num_outputs="3"> | 177 <test expect_num_outputs="3"> |
| 174 <conditional name="query|subcommand"> | 178 <conditional name="query|subcommand"> |
| 175 <param name="download_by" value="taxon"/> | 179 <param name="download_by" value="taxon"/> |
| 176 <param name="taxon_positional" value="human"/> | 180 <param name="taxon_positional" value="human"/> |
| 177 </conditional> | 181 </conditional> |
| 178 <param name="chromosomes" value="21"/> | 182 <section name="filters"> |
| 179 <param name="released_before" value="01/01/2018"/> | 183 <param name="chromosomes" value="21"/> |
| | 184 <param name="released_before" value="01/01/2018"/> |
| | 185 </section> |
| 180 <section name="file_choices"> | 186 <section name="file_choices"> |
| 181 <!-- include a sequence (which should be downloaded as fasta.gz) | 187 <!-- include a sequence (which should be downloaded as fasta.gz) |
| 182 and one non-sequence (which should be decompressed) output --> | 188 and one non-sequence (which should be decompressed) output --> |
| 183 <param name="include" value="rna,gff3"/> | 189 <param name="include" value="rna,gff3"/> |
| 184 </section> | 190 </section> |
| 185 <output name="genome_data_report"> | 191 <output name="genome_data_report"> |
| 186 <assert_contents> | 192 <assert_contents> |
| 187 <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> | 193 <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> |
| 188 <has_n_lines n="142"/> | 194 <has_n_lines min="140"/> |
| 189 <has_n_columns n="4"/> | 195 <has_n_columns n="4"/> |
| 190 </assert_contents> | 196 </assert_contents> |
| 191 </output> | 197 </output> |
| 192 <output_collection name="rna_fasta" type="list" count="1"> | 198 <output_collection name="rna_fasta" type="list"> |
| 193 <element name="GCF_000306695.2" decompress="true"> | 199 <element name="GCF_000306695.2" decompress="true"> |
| 194 <assert_contents> | 200 <assert_contents> |
| 195 <has_text text=">"/> | 201 <has_text text=">"/> |
| 196 </assert_contents> | 202 </assert_contents> |
| 197 </element> | 203 </element> |
| 198 </output_collection> | 204 </output_collection> |
| 199 <output_collection name="genomic_gff" type="list"> | 205 <output_collection name="genomic_gff" type="list"> |
| 200 <element name="GCF_000306695.2"> | 206 <element name="GCF_000306695.2"> |
| 210 </assert_command> | 216 </assert_command> |
| 211 </test> | 217 </test> |
| 212 <test expect_num_outputs="2"> | 218 <test expect_num_outputs="2"> |
| 213 <conditional name="query|subcommand"> | 219 <conditional name="query|subcommand"> |
| 214 <param name="download_by" value="taxon"/> | 220 <param name="download_by" value="taxon"/> |
| 215 <param name="taxon_positional" value="human"/> | 221 <param name="taxon_positional" value="Norway rat"/> |
| 216 </conditional> | 222 </conditional> |
| 217 <param name="chromosomes" value="21"/> | 223 <section name="filters"> |
| 218 <param name="assembly_level" value="chromosome,complete"/> | 224 <param name="chromosomes" value="MT"/> |
| 219 <param name="released_before" value="01/01/2018"/> | 225 </section> |
| 220 <section name="file_choices"> | 226 <section name="file_choices"> |
| 221 <param name="include" value="genome"/> | 227 <param name="include" value="genome"/> |
| 222 <param name="decompress" value="true"/> | 228 <param name="decompress" value="true"/> |
| 223 </section> | 229 </section> |
| 224 <output_collection name="genome_fasta" type="list:list" count="12"> | 230 <output_collection name="genome_fasta" type="list:list" count="9"> |
| 225 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> | 231 <expand macro="genome_fasta_assert" el1="GCA_000001895.4" el2="chrMT" expression=">"/> |
| 226 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> | 232 <expand macro="genome_fasta_assert" el1="GCA_015227675.2" el2="chrMT" expression=">"/> |
| 227 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> | 233 <expand macro="genome_fasta_assert" el1="GCA_036323735.1" el2="chrMT" expression=">"/> |
| 228 <expand macro="genome_fasta_assert" el1="GCA_000252825.1" el2="chr21" expression=">"/> | 234 <expand macro="genome_fasta_assert" el1="GCA_041222355.1" el2="chrMT" expression=">"/> |
| 229 <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression=">"/> | 235 <expand macro="genome_fasta_assert" el1="GCA_045687965.1" el2="chrMT" expression=">"/> |
| 230 <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression=">"/> | 236 <expand macro="genome_fasta_assert" el1="GCA_045687995.1" el2="chrMT" expression=">"/> |
| 231 <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression=">"/> | 237 <expand macro="genome_fasta_assert" el1="GCA_045688005.1" el2="chrMT" expression=">"/> |
| 232 <expand macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression=">"/> | 238 <expand macro="genome_fasta_assert" el1="GCF_000001895.5" el2="chrMT" expression=">"/> |
| 233 <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression=">"/> | 239 <expand macro="genome_fasta_assert" el1="GCF_015227675.2" el2="chrMT" expression=">"/> |
| 234 <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression=">"/> | |
| 235 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> | |
| 236 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> | |
| 237 <!-- According to https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 --> | 240 <!-- According to https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 --> |
| 238 <!-- | 241 <!-- |
| 239 <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/> | 242 <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/> |
| 240 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> | 243 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> |
| 241 --> | 244 --> |
| 242 </output_collection> | 245 </output_collection> |
| 243 <output name="genome_data_report"> | 246 <output name="genome_data_report"> |
| 244 <assert_contents> | 247 <assert_contents> |
| 245 <has_text text="Homo sapiens"/> | 248 <has_text text="Rattus norvegicus"/> |
| 246 <has_n_columns n="4"/> | 249 <has_n_columns n="4"/> |
| 247 </assert_contents> | 250 </assert_contents> |
| 248 </output> | 251 </output> |
| 249 </test> | 252 </test> |
| 250 <!-- same as previous test but assembly_source=refseq, which removes all of the genomes --> | 253 <!-- same as previous test but assembly_source=refseq, which removes all of the genomes --> |
| 251 <test expect_failure="true"> | 254 <test expect_failure="true"> |
| 252 <conditional name="query|subcommand"> | 255 <conditional name="query|subcommand"> |
| 253 <param name="download_by" value="taxon"/> | 256 <param name="download_by" value="taxon"/> |
| 254 <param name="taxon_positional" value="human"/> | 257 <param name="taxon_positional" value="human"/> |
| 255 </conditional> | 258 </conditional> |
| 256 <param name="chromosomes" value="21"/> | 259 <section name="filters"> |
| 257 <param name="assembly_level" value="chromosome,complete"/> | 260 <param name="chromosomes" value="21"/> |
| 258 <param name="assembly_source" value="refseq"/> | 261 <param name="assembly_level" value="chromosome,complete"/> |
| 259 <param name="released_before" value="01/01/2018"/> | 262 <param name="assembly_source" value="refseq"/> |
| | 263 <param name="released_before" value="01/01/2018"/> |
| | 264 </section> |
| 260 <section name="file_choices"> | 265 <section name="file_choices"> |
| 261 <param name="include" value="genome"/> | 266 <param name="include" value="genome"/> |
| 262 <param name="decompress" value="true"/> | 267 <param name="decompress" value="true"/> |
| 263 </section> | 268 </section> |
| 264 <assert_stderr> | 269 <assert_stderr> |
| 286 <conditional name="text_or_file"> | 291 <conditional name="text_or_file"> |
| 287 <param name="text_or_file" value="text"/> | 292 <param name="text_or_file" value="text"/> |
| 288 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> | 293 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> |
| 289 </conditional> | 294 </conditional> |
| 290 </conditional> | 295 </conditional> |
| 291 <param name="released_before" value="01/01/2007"/> | 296 <section name="filters"> |
| | 297 <param name="released_before" value="01/01/2007"/> |
| | 298 </section> |
| 292 <section name="file_choices"> | 299 <section name="file_choices"> |
| 293 <param name="include" value="seq-report,gtf,cds"/> | 300 <param name="include" value="seq-report,gtf,cds"/> |
| 294 <param name="decompress" value="true"/> | 301 <param name="decompress" value="true"/> |
| 295 </section> | 302 </section> |
| 296 <output name="genome_data_report"> | 303 <output name="genome_data_report"> |
| 298 <has_text text="GCF_000013305.1"/> | 305 <has_text text="GCF_000013305.1"/> |
| 299 <has_n_lines n="3"/> | 306 <has_n_lines n="3"/> |
| 300 <has_n_columns n="4"/> | 307 <has_n_columns n="4"/> |
| 301 </assert_contents> | 308 </assert_contents> |
| 302 </output> | 309 </output> |
| 303 <output_collection name="sequence_report" type="list" count="2" > | 310 <output_collection name="sequence_report" type="list" count="2"> |
| 304 <element name="GCF_000007445.1"> | 311 <element name="GCF_000007445.1"> |
| 305 <assert_contents> | 312 <assert_contents> |
| 306 <has_text text="GCF_000007445.1"/> | 313 <has_text text="GCF_000007445.1"/> |
| 307 <has_n_lines n="2"/> | 314 <has_n_lines n="2"/> |
| 308 <has_n_columns n="15"/> | 315 <has_n_columns n="15"/> |
| 314 <has_n_lines n="2"/> | 321 <has_n_lines n="2"/> |
| 315 <has_n_columns n="15"/> | 322 <has_n_columns n="15"/> |
| 316 </assert_contents> | 323 </assert_contents> |
| 317 </element> | 324 </element> |
| 318 </output_collection> | 325 </output_collection> |
| 319 <output_collection name="genomic_gtf" type="list"> | 326 <output_collection name="genomic_gtf" type="list" count="2"> |
| 320 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> | 327 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> |
| 321 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> | 328 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> |
| 322 </output_collection> | 329 </output_collection> |
| 323 <output_collection name="genomic_cds" type="list"> | 330 <output_collection name="genomic_cds" type="list"> |
| 324 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> | 331 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> |
| 331 <conditional name="text_or_file"> | 338 <conditional name="text_or_file"> |
| 332 <param name="text_or_file" value="file"/> | 339 <param name="text_or_file" value="file"/> |
| 333 <param name="inputfile" value="accessions.txt"/> | 340 <param name="inputfile" value="accessions.txt"/> |
| 334 </conditional> | 341 </conditional> |
| 335 </conditional> | 342 </conditional> |
| 336 <param name="released_before" value="01/01/2007"/> | 343 <section name="filters"> |
| | 344 <param name="released_before" value="01/01/2007"/> |
| | 345 </section> |
| 337 <section name="file_choices"> | 346 <section name="file_choices"> |
| 338 <param name="include" value="seq-report,gff3,gbff"/> | 347 <param name="include" value="seq-report,gff3,gbff"/> |
| 339 <param name="decompress" value="true"/> | 348 <param name="decompress" value="true"/> |
| 340 </section> | 349 </section> |
| 341 <output name="genome_data_report"> | 350 <output name="genome_data_report"> |
| 353 <output_collection name="genomic_gbff" type="list"> | 362 <output_collection name="genomic_gbff" type="list"> |
| 354 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> | 363 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> |
| 355 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> | 364 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> |
| 356 </output_collection> | 365 </output_collection> |
| 357 </test> | 366 </test> |
| 358 | |
| 359 <!-- should not fail https://github.com/ncbi/datasets/issues/194 --> | 367 <!-- should not fail https://github.com/ncbi/datasets/issues/194 --> |
| 360 <test expect_num_outputs="2"> | 368 <test expect_num_outputs="2"> |
| 361 <conditional name="query|subcommand"> | 369 <conditional name="query|subcommand"> |
| 362 <param name="download_by" value="accession"/> | 370 <param name="download_by" value="accession"/> |
| 363 <conditional name="text_or_file"> | 371 <conditional name="text_or_file"> |
| 364 <param name="text_or_file" value="text"/> | 372 <param name="text_or_file" value="text"/> |
| 365 <param name="accession" value="GCF_000001405"/> | 373 <param name="accession" value="GCF_000001405"/> |
| 366 </conditional> | 374 </conditional> |
| 367 </conditional> | 375 </conditional> |
| 368 <param name="released_before" value="01/01/2015"/> | 376 <section name="filters"> |
| 369 <param name="assembly_version" value="all"/> | 377 <param name="released_before" value="01/01/2015"/> |
| | 378 <param name="assembly_version" value="all"/> |
| | 379 </section> |
| 370 <section name="file_choices"> | 380 <section name="file_choices"> |
| 371 <param name="include" value="seq-report"/> | 381 <param name="include" value="seq-report"/> |
| 372 </section> | 382 </section> |
| 373 <output name="genome_data_report"> | 383 <output name="genome_data_report"> |
| 374 <!-- assert that we get at least the 16 versions available at the time of writing this test --> | 384 <!-- assert that we get at least the 16 versions available at the time of writing this test --> |
| 393 <section name="file_choices"> | 403 <section name="file_choices"> |
| 394 <param name="include" value="genome,protein,rna,cds"/> | 404 <param name="include" value="genome,protein,rna,cds"/> |
| 395 <param name="decompress" value="true"/> | 405 <param name="decompress" value="true"/> |
| 396 </section> | 406 </section> |
| 397 <output_collection name="genome_fasta" type="list:list" count="1"> | 407 <output_collection name="genome_fasta" type="list:list" count="1"> |
| 398 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> | 408 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> |
| 399 </output_collection> | 409 </output_collection> |
| 400 <output_collection name="protein_fasta" type="list" count="1"> | 410 <output_collection name="protein_fasta" type="list" count="1"> |
| 401 <element name="GCF_000146045.2" decompress="true"> | 411 <element name="GCF_000146045.2" decompress="true"> |
| 402 <assert_contents> | 412 <assert_contents> |
| 403 <has_text text=">"/> | 413 <has_text text=">"/> |
| 404 </assert_contents> | 414 </assert_contents> |
| 405 </element> | 415 </element> |
| 406 </output_collection> | 416 </output_collection> |
| 407 <output_collection name="rna_fasta" type="list" count="1"> | 417 <output_collection name="rna_fasta" type="list" count="1"> |
| 408 <element name="GCF_000146045.2" decompress="true"> | 418 <element name="GCF_000146045.2" decompress="true"> |
| 409 <assert_contents> | 419 <assert_contents> |
| 410 <has_text text=">"/> | 420 <has_text text=">"/> |
| 411 </assert_contents> | 421 </assert_contents> |
| 412 </element> | 422 </element> |
| 413 </output_collection> | 423 </output_collection> |
| 414 </test> | 424 </test> |
| 415 <!-- same as the previous test, but use the default value for decompress, | 425 <!-- same as the previous test, but use the default value for decompress, |
| 435 </element> | 445 </element> |
| 436 </output_collection> | 446 </output_collection> |
| 437 <output_collection name="protein_fasta" type="list" count="1"> | 447 <output_collection name="protein_fasta" type="list" count="1"> |
| 438 <element name="GCF_000146045.2" ftype="fasta.gz"> | 448 <element name="GCF_000146045.2" ftype="fasta.gz"> |
| 439 <assert_contents> | 449 <assert_contents> |
| 440 <has_size value="1845038" delta="2000"/> | 450 <has_size value="1847862" delta="2000"/> |
| 441 </assert_contents> | 451 </assert_contents> |
| 442 </element> | 452 </element> |
| 443 </output_collection> | 453 </output_collection> |
| 444 <output_collection name="rna_fasta" type="list" count="1"> | 454 <output_collection name="rna_fasta" type="list" count="1"> |
| 445 <element name="GCF_000146045.2" ftype="fasta.gz"> | 455 <element name="GCF_000146045.2" ftype="fasta.gz"> |
| 461 <param name="include" value="seq-report,genome"/> | 471 <param name="include" value="seq-report,genome"/> |
| 462 <param name="decompress" value="true"/> | 472 <param name="decompress" value="true"/> |
| 463 </section> | 473 </section> |
| 464 <output_collection name="sequence_report" type="list" count="2"/> | 474 <output_collection name="sequence_report" type="list" count="2"/> |
| 465 <output_collection name="genome_fasta" type="list:list" count="2"> | 475 <output_collection name="genome_fasta" type="list:list" count="2"> |
| 466 <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/> | 476 <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/> |
| 467 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/> | 477 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/> |
| 468 </output_collection> | 478 </output_collection> |
| 469 </test> | 479 </test> |
| 470 <!-- tax_exact_match should filter out strains | 480 <!-- tax_exact_match should filter out strains |
| 471 https://github.com/ncbi/datasets/issues/187 --> | 481 https://github.com/ncbi/datasets/issues/187 --> |
| 472 <test expect_num_outputs="1"> | 482 <test expect_num_outputs="2"> |
| 473 <conditional name="query|subcommand"> | 483 <conditional name="query|subcommand"> |
| 474 <param name="download_by" value="taxon"/> | 484 <param name="download_by" value="taxon"/> |
| 475 <param name="taxon_positional" value="4932"/> | 485 <param name="taxon_positional" value="4932"/> |
| 476 <param name="tax_exact_match" value="true"/> | 486 <param name="tax_exact_match" value="true"/> |
| 477 </conditional> | 487 </conditional> |
| 478 <param name="include" value=""/> | 488 <output name="genome_data_report"> |
| 479 <output name="genome_data_report"> | 489 <assert_contents> |
| 480 <assert_contents> | 490 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> |
| 481 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> | 491 </assert_contents> |
| | 492 </output> |
| | 493 </test> |
| | 494 <!-- test search filter --> |
| | 495 <test expect_num_outputs="1"> |
| | 496 <conditional name="query|subcommand"> |
| | 497 <param name="download_by" value="taxon"/> |
| | 498 <param name="taxon_positional" value="Streptococcus"/> |
| | 499 </conditional> |
| | 500 <section name="filters"> |
| | 501 <repeat name="search"> |
| | 502 <param name="search" value="pyogenes"/> |
| | 503 </repeat> |
| | 504 </section> |
| | 505 <section name="file_choices"> |
| | 506 <param name="include" value_json="null"/> |
| | 507 </section> |
| | 508 <output name="genome_data_report"> |
| | 509 <assert_contents> |
| | 510 <has_text text="pyogenes"/> |
| 482 </assert_contents> | 511 </assert_contents> |
| 483 </output> | 512 </output> |
| 484 </test> | 513 </test> |
| 485 </tests> | 514 </tests> |
| 486 <help> | 515 <help><![CDATA[ |
| 487 <![CDATA[ | 516 .. class:: infomark |
| 488 **Download Genome Datasets from NCBI** | 517 |
| 489 | 518 **What it does** |
| 490 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. | 519 |
| 491 Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon. | 520 Downloads genome assemblies from NCBI using the `datasets`_ command-line tool. |
| 492 | 521 Retrieve genome sequences, annotations, and metadata by accession or taxon. |
| 493 The download is a three step process: | 522 |
| 494 | 523 **Query Options** |
| 495 1. A "dehydrated" zip file is downloaded which includes the metadata and the download URL) | 524 |
| 496 2. The metadata is transformed into a tabular (TSV) file | 525 - **By Accession**: NCBI Assembly (GCF\_/GCA\_) or BioProject accession |
| 497 3. The data is hydrated (the actual data is downloaded) | 526 - **By Taxon**: Taxonomy ID, scientific name, or common name |
| 498 | 527 |
| 499 The 3rd step can be skipped by unselecting all output types in the `Include` parameter. | 528 **Filters** |
| 500 Thereby its possible to inspect the metadata prior to the actual data download. Also this | 529 |
| 501 allows to use the tool for querying data sets (and their accessions) of interest which | 530 ==================== =============================================== |
| 502 can then be downloaded in a second call using the accessions. | 531 Filter Description |
| 503 ]]> | 532 ==================== =============================================== |
| 504 </help> | 533 Reference only Limit to reference/representative assemblies |
| | 534 Annotated only Include only genomes with annotations |
| | 535 Assembly level Chromosome, complete, contig, or scaffold |
| | 536 Assembly source RefSeq (GCF\_) or GenBank (GCA\_) |
| | 537 Exclude atypical Remove atypical assemblies (e.g., partial) |
| | 538 MAG filter Include/exclude metagenome-assembled genomes |
| | 539 Date range Filter by release date |
| | 540 ==================== =============================================== |
| | 541 |
| | 542 ---- |
| | 543 |
| | 544 .. class:: warningmark |
| | 545 |
| | 546 **Note**: The "Reference only" filter returns only RefSeq (GCF\_) assemblies. |
| | 547 If a taxon has only GenBank (GCA\_) assemblies, this filter will return no results |
| | 548 with a misleading error message. It is a NCBI datasets bug (not a Galaxy bug). |
| | 549 |
| | 550 **Outputs** |
| | 551 |
| | 552 - **Data Report**: Tabular metadata for matching assemblies |
| | 553 - **Genome FASTA**: Genomic sequences (nested collection by accession) |
| | 554 - **Annotation files**: GFF3, GTF, GenBank flat files |
| | 555 - **Protein/RNA/CDS**: Amino acid and nucleotide sequences |
| | 556 - **Sequence Report**: Per-sequence metadata (chromosome, length, etc.) |
| | 557 |
| | 558 .. _datasets: https://www.ncbi.nlm.nih.gov/datasets/ |
| | 559 |
| | 560 ]]></help> |
| 505 <expand macro="citations"/> | 561 <expand macro="citations"/> |
| 506 </tool> | 562 </tool> |
