Mercurial > repos > iuc > ncbi_datasets
view datasets_genome.xml @ 20:35d32c807c23 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5a65a62588a36d757f96681bf72f537c12c91beb
| author | iuc |
|---|---|
| date | Fri, 26 Dec 2025 17:16:51 +0000 |
| parents | 9a10a6449901 |
| children |
line wrap: on
line source
<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <!--
        Galaxy wrapper around the NCBI "datasets download genome" command.

        The command section below:
          1. downloads a dehydrated package (ncbi_dataset.zip) for the requested
             accessions or taxon, applying the selected filters;
          2. converts the package metadata into genome_data_report.tsv with "dataformat";
          3. if any file type was selected in "include", unzips and rehydrates the package
             and renames / decompresses the files so the collections in the outputs
             section can discover them.

        Tokens of the form @NAME@ and the expanded macros are not defined in this file;
        presumably they come from the imported macros.xml.
    -->
    <description>download genome sequence, annotation and metadata</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <!-- stderr patterns: "Warning" and "skipping" are reported as warnings, "ERROR" fails the job -->
    <stdio>
        <regex match="Warning" source="stderr" level="warning" description=""/>
        <regex match="skipping" source="stderr" level="warning" description=""/>
        <regex match="ERROR" level="fatal"/>
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
#import re
@SETUP_CERTIFICATES@

datasets download genome $query.subcommand.download_by
#if $query.subcommand.download_by == 'accession':
    #if $query.subcommand.text_or_file.text_or_file == 'text':
        ## accessions may be separated by spaces or commas; each one is single-quoted
        #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
    #else
        --inputfile '$query.subcommand.text_or_file.inputfile'
    #end if
#else:
    '$query.subcommand.taxon_positional'
    $query.subcommand.tax_exact_match
#end if
$filters.reference
$filters.annotated
#if $filters.assembly_level:
    --assembly-level $filters.assembly_level
#end if
--assembly-version $filters.assembly_version
#if $filters.assembly_source:
    --assembly-source $filters.assembly_source
#end if
#if $filters.chromosomes:
    --chromosomes '$filters.chromosomes'
#end if
$filters.exclude_atypical
#if $filters.mag:
    --mag '$filters.mag'
#end if
@INCLUDE@
@RELEASED_BEFORE@
@RELEASED_AFTER@
#for search_term in $filters.search:
    --search '$search_term.search'
#end for
--no-progressbar
--dehydrated

## produce TSV report file
&& dataformat tsv genome
    --package ncbi_dataset.zip
    --fields #echo ",".join($file_choices.report_columns)
    > genome_data_report.tsv

## unzip and rehydrate if any data is to be downloaded (include is not None)
#if $file_choices.include
    ## unzip
    && unzip ncbi_dataset.zip

    ## rehydrate
    && datasets rehydrate
        --directory ./
        #if not $file_choices.decompress
            --gzip
        #end if
        --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}

    ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
    && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;

    ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
    ## note "not decompress" means that the datasets are provided compressed (datasets rehydrate is called with --gzip)
    ## in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
    && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
    #if not $file_choices.decompress
        && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
    #end if
    #if "seq-report" in $file_choices.include
        ## convert each sequence_report.jsonl into a sequence_report.tsv next to it
        && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
    #end if
    && true  ## because Galaxy removes trailing ; from command
#end if
    ]]></command>
    <inputs>
        <!-- what to download: explicit accessions (typed or from a file) or everything under a taxon -->
        <section name="query" title="Query" expanded="true">
            <conditional name="subcommand">
                <param name="download_by" type="select" label="Choose how to find genomes to download">
                    <option value="accession">By NCBI assembly or BioProject accession</option>
                    <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
                </param>
                <when value="accession">
                    <expand macro="text_or_file"/>
                </when>
                <when value="taxon">
                    <expand macro="taxon_positional"/>
                    <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
                </when>
            </conditional>
        </section>
        <!-- optional restrictions on the set of assemblies returned by the query -->
        <section name="filters" title="Filters and Limit">
            <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
            <expand macro="annotation"/>
            <expand macro="assembly_level"/>
            <param argument="--assembly-version" type="select" label="Assembly version(s)">
                <option value="latest">Latest</option>
                <option value="all">All</option>
            </param>
            <expand macro="assembly_source"/>
            <expand macro="chromosomes"/>
            <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
            <param argument="--mag" type="select" multiple="false" optional="true" label="Filter metagenome assembled genomes (MAGs)">
                <option value="only" selected="false">Limit to MAGs</option>
                <option value="exclude" selected="false">Exclude MAGs</option>
            </param>
            <expand macro="released_options"/>
            <expand macro="released_options" before_or_after="after"/>
            <repeat name="search" title="Add search terms">
                <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
            </repeat>
        </section>
        <!-- report columns, file types to download, and whether FASTA stays gzip-compressed -->
        <section name="file_choices" title="Output options" expanded="true">
            <expand macro="tsv_report_columns">
                <option value="accession" selected="true">accession</option>
                <option value="organism-name" selected="true">organism-name</option>
                <option value="assminfo-submitter" selected="true">assminfo-submitter</option>
                <option value="assminfo-name" selected="true">assminfo-name</option>
            </expand>
            <expand macro="include">
                <expand macro="genome_includes"/>
            </expand>
            <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
        </section>
    </inputs>
    <outputs>
        <!-- the TSV report is always produced; each collection is only created when its file type was included -->
        <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
        <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
        </collection>
        <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
            <!-- the negative lookahead keeps rna and cds_from_genomic FASTA files out of this collection -->
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
        </collection>
        <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
        </collection>
        <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
        </collection>
    </outputs>
    <tests>
        <test expect_num_outputs="3">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <section name="filters">
                <param name="chromosomes" value="21"/>
                <param name="released_before" value="01/01/2018"/>
            </section>
            <section name="file_choices">
                <!-- include a sequence (which should be downloaded as fasta.gz) and one non-sequence (which should be decompressed) output -->
                <param name="include" value="rna,gff3"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <!-- tabs are written as character references: literal tabs in an attribute are normalised to spaces -->
                    <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/>
                    <has_n_lines min="140"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="rna_fasta" type="list">
                <element name="GCF_000306695.2" decompress="true">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="genomic_gff" type="list">
                <element name="GCF_000306695.2">
                    <assert_contents>
                        <has_n_lines min="1000000"/>
                        <has_line line="##gff-version 3"/>
                        <has_n_columns n="9" comment="#"/>
                    </assert_contents>
                </element>
            </output_collection>
            <assert_command>
                <has_text text="gunzip"/>
            </assert_command>
        </test>
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="Norway rat"/>
            </conditional>
            <section name="filters">
                <param name="chromosomes" value="MT"/>
            </section>
            <section name="file_choices">
                <param name="include" value="genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="9">
                <expand macro="genome_fasta_assert" el1="GCA_000001895.4" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_015227675.2" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_036323735.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_041222355.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_045687965.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_045687995.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_045688005.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCF_000001895.5" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCF_015227675.2" el2="chrMT" expression="&gt;"/>
                <!-- According to https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 -->
                <!--
                <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/>
                <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
                -->
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Rattus norvegicus"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
        </test>
        <!-- same as previous test but assembly_source=refseq, which removes all of the genomes -->
        <test expect_failure="true">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <section name="filters">
                <param name="chromosomes" value="21"/>
                <param name="assembly_level" value="chromosome,complete"/>
                <param name="assembly_source" value="refseq"/>
                <param name="released_before" value="01/01/2018"/>
            </section>
            <section name="file_choices">
                <param name="include" value="genome"/>
                <param name="decompress" value="true"/>
            </section>
            <assert_stderr>
                <has_text text="no genome assemblies were found"/>
            </assert_stderr>
            <!-- In the current state of the NCBI tool/DB, no output to check. But the returned results seem to change from time to time and it might be necessary to re-enable this code block if the test fails in the future. -->
            <!--
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Homo sapiens"/>
                    <has_n_lines n="5"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            -->
        </test>
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
                </conditional>
            </conditional>
            <section name="filters">
                <param name="released_before" value="01/01/2007"/>
            </section>
            <section name="file_choices">
                <param name="include" value="seq-report,gtf,cds"/>
                <param name="decompress" value="true"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="GCF_000013305.1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="sequence_report" type="list" count="2">
                <element name="GCF_000007445.1">
                    <assert_contents>
                        <has_text text="GCF_000007445.1"/>
                        <has_n_lines n="2"/>
                        <has_n_columns n="15"/>
                    </assert_contents>
                </element>
                <element name="GCF_000013305.1">
                    <assert_contents>
                        <has_text text="GCF_000013305.1"/>
                        <has_n_lines n="2"/>
                        <has_n_columns n="15"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="genomic_gtf" type="list" count="2">
                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_cds" type="list">
                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
            </output_collection>
        </test>
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="file"/>
                    <param name="inputfile" value="accessions.txt"/>
                </conditional>
            </conditional>
            <section name="filters">
                <param name="released_before" value="01/01/2007"/>
            </section>
            <section name="file_choices">
                <param name="include" value="seq-report,gff3,gbff"/>
                <param name="decompress" value="true"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="GCF_000013305.1"/>
                    <has_text text="GCF_000007445.1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="genomic_gff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_gbff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
            </output_collection>
        </test>
        <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000001405"/>
                </conditional>
            </conditional>
            <section name="filters">
                <param name="released_before" value="01/01/2015"/>
                <param name="assembly_version" value="all"/>
            </section>
            <section name="file_choices">
                <param name="include" value="seq-report"/>
            </section>
            <output name="genome_data_report">
                <!-- assert that we get at least the 16 versions available at the time of writing this test -->
                <assert_contents>
                    <has_text text="GCF_000001405" min="16"/>
                    <has_n_lines min="16"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <!--not testing the collection output. the count will change over time and this can't be tested for at the moment
            <output_collection name="sequence_report" type="list" count="16"/>
            -->
        </test>
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="genome,protein,rna,cds"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
            </output_collection>
            <output_collection name="protein_fasta" type="list" count="1">
                <element name="GCF_000146045.2" decompress="true">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000146045.2" decompress="true">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- same as the previous test, but use the default value for decompress, see comment at the beginning of the tests -->
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="genome,protein,rna,cds"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <element name="GCF_000146045.2">
                    <element name="GCF_000146045.2_R64" ftype="fasta.gz">
                        <assert_contents>
                            <has_size value="3843460" delta="2000"/>
                        </assert_contents>
                    </element>
                </element>
            </output_collection>
            <output_collection name="protein_fasta" type="list" count="1">
                <element name="GCF_000146045.2" ftype="fasta.gz">
                    <assert_contents>
                        <has_size value="1847862" delta="2000"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000146045.2" ftype="fasta.gz">
                    <assert_contents>
                        <has_size min="2700000" max="2800000"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test expect_num_outputs="3">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="seq-report,genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="sequence_report" type="list" count="2"/>
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression="&gt;NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/>
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/>
            </output_collection>
        </test>
        <!-- tax_exact_match should filter out strains https://github.com/ncbi/datasets/issues/187 -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="4932"/>
                <param name="tax_exact_match" value="true"/>
            </conditional>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
                </assert_contents>
            </output>
        </test>
        <!-- test search filter -->
        <test expect_num_outputs="1">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="Streptococcus"/>
            </conditional>
            <section name="filters">
                <repeat name="search">
                    <param name="search" value="pyogenes"/>
                </repeat>
            </section>
            <section name="file_choices">
                <param name="include" value_json="null"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="pyogenes"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

Downloads genome assemblies from NCBI using the `datasets`_ command-line tool. Retrieve genome sequences, annotations, and metadata by accession or taxon.

**Query Options**

- **By Accession**: NCBI Assembly (GCF\_/GCA\_) or BioProject accession
- **By Taxon**: Taxonomy ID, scientific name, or common name

**Filters**

==================== ===============================================
Filter               Description
==================== ===============================================
Reference only       Limit to reference/representative assemblies
Annotated only       Include only genomes with annotations
Assembly level       Chromosome, complete, contig, or scaffold
Assembly source      RefSeq (GCF\_) or GenBank (GCA\_)
Exclude atypical     Remove atypical assemblies (e.g., partial)
MAG filter           Include/exclude metagenome-assembled genomes
Date range           Filter by release date
==================== ===============================================

----

.. class:: warningmark

**Note**: The "Reference only" filter returns only RefSeq (GCF\_) assemblies. If a taxon has only GenBank (GCA\_) assemblies, this filter will return no results with a misleading error message. This is an NCBI datasets bug (not a Galaxy bug).

**Outputs**

- **Data Report**: Tabular metadata for matching assemblies
- **Genome FASTA**: Genomic sequences (nested collection by accession)
- **Annotation files**: GFF3, GTF, GenBank flat files
- **Protein/RNA/CDS**: Amino acid and nucleotide sequences
- **Sequence Report**: Per-sequence metadata (chromosome, length, etc.)

.. _datasets: https://www.ncbi.nlm.nih.gov/datasets/
    ]]></help>
    <expand macro="citations"/>
</tool>
