Mercurial > repos > iuc > ncbi_datasets
diff datasets_genome.xml @ 20:35d32c807c23 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5a65a62588a36d757f96681bf72f537c12c91beb
| author | iuc |
|---|---|
| date | Fri, 26 Dec 2025 17:16:51 +0000 |
| parents | 9a10a6449901 |
| children |
line wrap: on
line diff
--- a/datasets_genome.xml Mon Mar 17 11:05:34 2025 +0000 +++ b/datasets_genome.xml Fri Dec 26 17:16:51 2025 +0000 @@ -4,9 +4,14 @@ <import>macros.xml</import> </macros> <expand macro="bio_tools"/> - <expand macro="requirements"></expand> + <expand macro="requirements"/> <expand macro="version_command"/> - <command><![CDATA[ + <stdio> + <regex match="Warning" source="stderr" level="warning" description=""/> + <regex match="skipping" source="stderr" level="warning" description=""/> + <regex match="ERROR" level="fatal"/> + </stdio> + <command detect_errors="exit_code"><![CDATA[ #import re @SETUP_CERTIFICATES@ datasets download genome $query.subcommand.download_by @@ -41,7 +46,7 @@ @RELEASED_BEFORE@ @RELEASED_AFTER@ #for search_term in $filters.search: - --search '$filters.search_term' + --search '$search_term.search' #end for --no-progressbar --dehydrated @@ -116,7 +121,6 @@ </param> <expand macro="released_options"/> <expand macro="released_options" before_or_after="after"/> - <repeat name="search" title="Add search terms"> <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> </repeat> @@ -137,35 +141,35 @@ <outputs> <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/> <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter> </collection> <collection name="genome_fasta" label="NCBI Genome 
Datasets: genome fasta" type="list:list"> - <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?\.(?P<ext>fasta(\.gz)?)" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?\.(?P<ext>fasta(\.gz)?)" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> <filter>file_choices['include'] and "genome" in file_choices['include']</filter> </collection> <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> <filter>file_choices['include'] and "rna" in file_choices['include']</filter> </collection> <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> <filter>file_choices['include'] and "protein" in file_choices['include']</filter> </collection> <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> <filter>file_choices['include'] and 
"cds" in file_choices['include']</filter> </collection> <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> <filter>file_choices['include'] and "gff3" in file_choices['include']</filter> </collection> <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> <filter>file_choices['include'] and "gtf" in file_choices['include']</filter> </collection> <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> </collection> </outputs> @@ -175,8 +179,10 @@ <param name="download_by" value="taxon"/> <param name="taxon_positional" value="human"/> </conditional> - <param name="chromosomes" value="21"/> - <param name="released_before" value="01/01/2018"/> + <section name="filters"> + <param name="chromosomes" value="21"/> + <param name="released_before" value="01/01/2018"/> + </section> <section name="file_choices"> <!-- include a sequence (which should be downloaded as fasta.gz) and one non-sequence (which should 
be decompressed) output --> @@ -184,15 +190,15 @@ </section> <output name="genome_data_report"> <assert_contents> - <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> - <has_n_lines n="142"/> + <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> + <has_n_lines min="140"/> <has_n_columns n="4"/> </assert_contents> </output> - <output_collection name="rna_fasta" type="list" count="1"> + <output_collection name="rna_fasta" type="list"> <element name="GCF_000306695.2" decompress="true"> <assert_contents> - <has_text text=">"/> + <has_text text=">"/> </assert_contents> </element> </output_collection> @@ -212,28 +218,25 @@ <test expect_num_outputs="2"> <conditional name="query|subcommand"> <param name="download_by" value="taxon"/> - <param name="taxon_positional" value="human"/> + <param name="taxon_positional" value="Norway rat"/> </conditional> - <param name="chromosomes" value="21"/> - <param name="assembly_level" value="chromosome,complete"/> - <param name="released_before" value="01/01/2018"/> + <section name="filters"> + <param name="chromosomes" value="MT"/> + </section> <section name="file_choices"> <param name="include" value="genome"/> <param name="decompress" value="true"/> </section> - <output_collection name="genome_fasta" type="list:list" count="12"> - <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCA_000252825.1" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression=">"/> - <expand 
macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> - <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> + <output_collection name="genome_fasta" type="list:list" count="9"> + <expand macro="genome_fasta_assert" el1="GCA_000001895.4" el2="chrMT" expression=">"/> + <expand macro="genome_fasta_assert" el1="GCA_015227675.2" el2="chrMT" expression=">"/> + <expand macro="genome_fasta_assert" el1="GCA_036323735.1" el2="chrMT" expression=">"/> + <expand macro="genome_fasta_assert" el1="GCA_041222355.1" el2="chrMT" expression=">"/> + <expand macro="genome_fasta_assert" el1="GCA_045687965.1" el2="chrMT" expression=">"/> + <expand macro="genome_fasta_assert" el1="GCA_045687995.1" el2="chrMT" expression=">"/> + <expand macro="genome_fasta_assert" el1="GCA_045688005.1" el2="chrMT" expression=">"/> + <expand macro="genome_fasta_assert" el1="GCF_000001895.5" el2="chrMT" expression=">"/> + <expand macro="genome_fasta_assert" el1="GCF_015227675.2" el2="chrMT" expression=">"/> <!-- According to https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 --> <!-- <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/> @@ -242,7 +245,7 @@ </output_collection> <output name="genome_data_report"> <assert_contents> - <has_text text="Homo sapiens"/> + <has_text text="Rattus norvegicus"/> <has_n_columns n="4"/> </assert_contents> </output> @@ -253,10 +256,12 @@ <param name="download_by" value="taxon"/> <param name="taxon_positional" value="human"/> </conditional> - <param name="chromosomes" value="21"/> - <param 
name="assembly_level" value="chromosome,complete"/> - <param name="assembly_source" value="refseq"/> - <param name="released_before" value="01/01/2018"/> + <section name="filters"> + <param name="chromosomes" value="21"/> + <param name="assembly_level" value="chromosome,complete"/> + <param name="assembly_source" value="refseq"/> + <param name="released_before" value="01/01/2018"/> + </section> <section name="file_choices"> <param name="include" value="genome"/> <param name="decompress" value="true"/> @@ -288,7 +293,9 @@ <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> </conditional> </conditional> - <param name="released_before" value="01/01/2007"/> + <section name="filters"> + <param name="released_before" value="01/01/2007"/> + </section> <section name="file_choices"> <param name="include" value="seq-report,gtf,cds"/> <param name="decompress" value="true"/> @@ -300,7 +307,7 @@ <has_n_columns n="4"/> </assert_contents> </output> - <output_collection name="sequence_report" type="list" count="2" > + <output_collection name="sequence_report" type="list" count="2"> <element name="GCF_000007445.1"> <assert_contents> <has_text text="GCF_000007445.1"/> @@ -316,7 +323,7 @@ </assert_contents> </element> </output_collection> - <output_collection name="genomic_gtf" type="list"> + <output_collection name="genomic_gtf" type="list" count="2"> <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> </output_collection> @@ -333,7 +340,9 @@ <param name="inputfile" value="accessions.txt"/> </conditional> </conditional> - <param name="released_before" value="01/01/2007"/> + <section name="filters"> + <param name="released_before" value="01/01/2007"/> + </section> <section name="file_choices"> <param name="include" value="seq-report,gff3,gbff"/> <param name="decompress" value="true"/> @@ -355,7 +364,6 @@ <element name="GCF_000013305.1" 
file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> </output_collection> </test> - <!-- should not fail https://github.com/ncbi/datasets/issues/194 --> <test expect_num_outputs="2"> <conditional name="query|subcommand"> @@ -365,8 +373,10 @@ <param name="accession" value="GCF_000001405"/> </conditional> </conditional> - <param name="released_before" value="01/01/2015"/> - <param name="assembly_version" value="all"/> + <section name="filters"> + <param name="released_before" value="01/01/2015"/> + <param name="assembly_version" value="all"/> + </section> <section name="file_choices"> <param name="include" value="seq-report"/> </section> @@ -395,19 +405,19 @@ <param name="decompress" value="true"/> </section> <output_collection name="genome_fasta" type="list:list" count="1"> - <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> + <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> </output_collection> <output_collection name="protein_fasta" type="list" count="1"> <element name="GCF_000146045.2" decompress="true"> <assert_contents> - <has_text text=">"/> + <has_text text=">"/> </assert_contents> </element> </output_collection> <output_collection name="rna_fasta" type="list" count="1"> <element name="GCF_000146045.2" decompress="true"> <assert_contents> - <has_text text=">"/> + <has_text text=">"/> </assert_contents> </element> </output_collection> @@ -437,7 +447,7 @@ <output_collection name="protein_fasta" type="list" count="1"> <element name="GCF_000146045.2" ftype="fasta.gz"> <assert_contents> - <has_size value="1845038" delta="2000"/> + <has_size value="1847862" delta="2000"/> </assert_contents> </element> 
</output_collection> @@ -463,44 +473,90 @@ </section> <output_collection name="sequence_report" type="list" count="2"/> <output_collection name="genome_fasta" type="list:list" count="2"> - <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/> - <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/> + <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/> + <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/> </output_collection> </test> <!-- tax_exact_match should filter out strains https://github.com/ncbi/datasets/issues/187 --> - <test expect_num_outputs="1"> + <test expect_num_outputs="2"> <conditional name="query|subcommand"> <param name="download_by" value="taxon"/> <param name="taxon_positional" value="4932"/> <param name="tax_exact_match" value="true"/> </conditional> - <param name="include" value=""/> <output name="genome_data_report"> <assert_contents> - <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> + <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> + </assert_contents> + </output> + </test> + <!-- test search filter --> + <test expect_num_outputs="1"> + <conditional name="query|subcommand"> + <param name="download_by" value="taxon"/> + <param name="taxon_positional" value="Streptococcus"/> + </conditional> + <section name="filters"> + <repeat name="search"> + <param name="search" value="pyogenes"/> + </repeat> + </section> + <section name="file_choices"> + <param name="include" value_json="null"/> + </section> + <output name="genome_data_report"> + <assert_contents> + 
<has_text text="pyogenes"/> </assert_contents> </output> </test> </tests> - <help> -<![CDATA[ -**Download Genome Datasets from NCBI** + <help><![CDATA[ +.. class:: infomark + +**What it does** -Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. -Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon. +Downloads genome assemblies from NCBI using the `datasets`_ command-line tool. +Retrieve genome sequences, annotations, and metadata by accession or taxon. + +**Query Options** -The download is a three step process: +- **By Accession**: NCBI Assembly (GCF\_/GCA\_) or BioProject accession +- **By Taxon**: Taxonomy ID, scientific name, or common name + +**Filters** -1. A "dehydrated" zip file is downloaded which includes the metadata and the download URL) -2. The metadata is transformed into a tabular (TSV) file -3. The data is hydrated (the actual data is downloaded) +==================== =============================================== +Filter Description +==================== =============================================== +Reference only Limit to reference/representative assemblies +Annotated only Include only genomes with annotations +Assembly level Chromosome, complete, contig, or scaffold +Assembly source RefSeq (GCF\_) or GenBank (GCA\_) +Exclude atypical Remove atypical assemblies (e.g., partial) +MAG filter Include/exclude metagenome-assembled genomes +Date range Filter by release date +==================== =============================================== + +---- + +.. class:: warningmark -The 3rd step can be skipped by unselecting all output types in the `Include` parameter. -Thereby its possible to inspect the metadata prior to the actual data download. Also this -allows to use the tool for querying data sets (and their accessions) of interest which -can then be downloaded in a second call using the accessions. 
-]]> - </help> +**Note**: The "Reference only" filter returns only RefSeq (GCF\_) assemblies. +If a taxon has only GenBank (GCA\_) assemblies, this filter will return no results +with a misleading error message. It is an NCBI datasets bug (not a Galaxy bug). + +**Outputs** + +- **Data Report**: Tabular metadata for matching assemblies +- **Genome FASTA**: Genomic sequences (nested collection by accession) +- **Annotation files**: GFF3, GTF, GenBank flat files +- **Protein/RNA/CDS**: Amino acid and nucleotide sequences +- **Sequence Report**: Per-sequence metadata (chromosome, length, etc.) + +.. _datasets: https://www.ncbi.nlm.nih.gov/datasets/ + +]]></help> <expand macro="citations"/> </tool>
