view datasets_genome.xml @ 20:35d32c807c23 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5a65a62588a36d757f96681bf72f537c12c91beb
author iuc
date Fri, 26 Dec 2025 17:16:51 +0000
parents 9a10a6449901
children
line wrap: on
line source

<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>download genome sequence, annotation and metadata</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <stdio>
        <regex match="Warning" source="stderr" level="warning" description=""/>
        <regex match="skipping" source="stderr" level="warning" description=""/>
        <regex match="ERROR" level="fatal"/>
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
#import re
@SETUP_CERTIFICATES@
datasets download genome $query.subcommand.download_by
#if $query.subcommand.download_by == 'accession':
    #if $query.subcommand.text_or_file.text_or_file == 'text':
        #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
    #else
        --inputfile '$query.subcommand.text_or_file.inputfile'
    #end if
#else:
    '$query.subcommand.taxon_positional'
    $query.subcommand.tax_exact_match
#end if
$filters.reference
$filters.annotated
#if $filters.assembly_level:
    --assembly-level $filters.assembly_level
#end if
--assembly-version $filters.assembly_version
#if $filters.assembly_source:
    --assembly-source $filters.assembly_source
#end if
#if $filters.chromosomes:
    --chromosomes '$filters.chromosomes'
#end if
$filters.exclude_atypical
#if $filters.mag:
    --mag '$filters.mag'
#end if

@INCLUDE@
@RELEASED_BEFORE@
@RELEASED_AFTER@
#for search_term in $filters.search:
    --search '$search_term.search'
#end for
--no-progressbar
--dehydrated

## produce TSV report file
&& dataformat tsv genome 
    --package ncbi_dataset.zip
    --fields #echo ",".join($file_choices.report_columns) 
    > genome_data_report.tsv

## unzip and rehydrate if any data is to be downloaded (include is not None)
#if $file_choices.include
    ## unzip
    && unzip ncbi_dataset.zip

    ## rehydrate
    && datasets rehydrate
        --directory ./
        #if not $file_choices.decompress
            --gzip
        #end if
        --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}

    ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
    && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;

    ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
    ## note "not decompress" means that the datasets are provided uncompressed (datasets rehydrate is called we --gzip)
    ##      in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
    && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
    #if not $file_choices.decompress
        && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
    #end if

    #if "seq-report" in $file_choices.include
        && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
    #end if
    
    && true  ## because Galaxy removes trailing ; from command
#end if
]]></command>
    <inputs>
        <section name="query" title="Query" expanded="true">
            <conditional name="subcommand">
                <param name="download_by" type="select" label="Choose how to find genomes to download">
                    <option value="accession">By NCBI assembly or BioProject accession</option>
                    <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
                </param>
                <when value="accession">
                    <expand macro="text_or_file"/>
                </when>
                <when value="taxon">
                    <expand macro="taxon_positional"/>
                    <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
                </when>
            </conditional>
        </section>
        <section name="filters" title="Filters and Limit">
            <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
            <expand macro="annotation"/>
            <expand macro="assembly_level"/>
            <param argument="--assembly-version" type="select" label="Assembly version(s)">
                <option value="latest">Latest</option>
                <option value="all">All</option>
            </param>
            <expand macro="assembly_source"/>
            <expand macro="chromosomes"/>
            <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
            <param argument="--mag" type="select" multiple="false" optional="true" label="Filter metagenome assembled genomes (MAGs)">
                <option value="only" selected="false">Limit to MAGs</option>
                <option value="exclude" selected="false">Exclude MAGs</option>
            </param>
            <expand macro="released_options"/>
            <expand macro="released_options" before_or_after="after"/>
            <repeat name="search" title="Add search terms">
                <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
            </repeat>
        </section>
        <section name="file_choices" title="Output options" expanded="true">
            <expand macro="tsv_report_columns">
                <option value="accession" selected="true">accession</option>
                <option value="organism-name" selected="true">organism-name</option>
                <option value="assminfo-submitter" selected="true">assminfo-submitter</option>
                <option value="assminfo-name" selected="true">assminfo-name</option>
            </expand>
            <expand macro="include">
                <expand macro="genome_includes"/>
            </expand>
            <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
        </section>
    </inputs>
    <outputs>
        <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
        <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
        </collection>
        <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
        </collection>
        <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
        </collection>
        <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
        </collection>
    </outputs>
    <tests>
        <test expect_num_outputs="3">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <section name="filters">
                <param name="chromosomes" value="21"/>
                <param name="released_before" value="01/01/2018"/>
            </section>
            <section name="file_choices">
                <!-- include a sequence (which should be downloaded as fasta.gz)
                     and one non-sequence (which should be decompressed) output -->
                <param name="include" value="rna,gff3"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/>
                    <has_n_lines min="140"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="rna_fasta" type="list">
                <element name="GCF_000306695.2" decompress="true">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="genomic_gff" type="list">
                <element name="GCF_000306695.2">
                    <assert_contents>
                        <has_n_lines min="1000000"/>
                        <has_line line="##gff-version 3"/>
                        <has_n_columns n="9" comment="#"/>
                    </assert_contents>
                </element>
            </output_collection>
            <assert_command>
                <has_text text="gunzip"/>
            </assert_command>
        </test>
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="Norway rat"/>
            </conditional>
            <section name="filters">
                <param name="chromosomes" value="MT"/>
            </section>
            <section name="file_choices">
                <param name="include" value="genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="9">
                <expand macro="genome_fasta_assert" el1="GCA_000001895.4" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_015227675.2" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_036323735.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_041222355.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_045687965.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_045687995.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCA_045688005.1" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCF_000001895.5" el2="chrMT" expression="&gt;"/>
                <expand macro="genome_fasta_assert" el1="GCF_015227675.2" el2="chrMT" expression="&gt;"/>
                <!-- According to  https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 -->
                <!--
                <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/>
                <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
                -->
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Rattus norvegicus"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
        </test>
        <!-- same as previous test but assembly_source=refseq, which removes all of the genomes -->
        <test expect_failure="true">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <section name="filters">
                <param name="chromosomes" value="21"/>
                <param name="assembly_level" value="chromosome,complete"/>
                <param name="assembly_source" value="refseq"/>
                <param name="released_before" value="01/01/2018"/>
            </section>
            <section name="file_choices">
                <param name="include" value="genome"/>
                <param name="decompress" value="true"/>
            </section>
            <assert_stderr>
                <has_text text="no genome assemblies were found"/>
            </assert_stderr>
            <!-- In the current state of the NCBI tool/DB, no output to check.
             But the returned results seem to change from time to time and it might
             be necessary to re-enable this code block if the test fails in the future. -->
            <!--
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Homo sapiens"/>
                    <has_n_lines n="5"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output> -->
        </test>
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
                </conditional>
            </conditional>
            <section name="filters">
                <param name="released_before" value="01/01/2007"/>
            </section>
            <section name="file_choices">
                <param name="include" value="seq-report,gtf,cds"/>
                <param name="decompress" value="true"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="GCF_000013305.1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="sequence_report" type="list" count="2">
                <element name="GCF_000007445.1">
                    <assert_contents>
                        <has_text text="GCF_000007445.1"/>
                        <has_n_lines n="2"/>
                        <has_n_columns n="15"/>
                    </assert_contents>
                </element>
                <element name="GCF_000013305.1">
                    <assert_contents>
                        <has_text text="GCF_000013305.1"/>
                        <has_n_lines n="2"/>
                        <has_n_columns n="15"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="genomic_gtf" type="list" count="2">
                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_cds" type="list">
                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
            </output_collection>
        </test>
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="file"/>
                    <param name="inputfile" value="accessions.txt"/>
                </conditional>
            </conditional>
            <section name="filters">
                <param name="released_before" value="01/01/2007"/>
            </section>
            <section name="file_choices">
                <param name="include" value="seq-report,gff3,gbff"/>
                <param name="decompress" value="true"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="GCF_000013305.1"/>
                    <has_text text="GCF_000007445.1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="genomic_gff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_gbff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
            </output_collection>
        </test>
        <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000001405"/>
                </conditional>
            </conditional>
            <section name="filters">
                <param name="released_before" value="01/01/2015"/>
                <param name="assembly_version" value="all"/>
            </section>
            <section name="file_choices">
                <param name="include" value="seq-report"/>
            </section>
            <output name="genome_data_report">
                <!-- assert that we get at least the 16 versions available at the time of writing this test -->
                <assert_contents>
                    <has_text text="GCF_000001405" min="16"/>
                    <has_n_lines min="16"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <!--not testing the collection output. the count will change over time
                and this can't be tested for at the moment
                <output_collection name="sequence_report" type="list" count="16"/> -->
        </test>
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="genome,protein,rna,cds"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
            </output_collection>
            <output_collection name="protein_fasta" type="list" count="1">
                <element name="GCF_000146045.2" decompress="true">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000146045.2" decompress="true">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- same as the previous test, but use the default value for decompress,
            see comment at the beginning of the tests -->
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="genome,protein,rna,cds"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <element name="GCF_000146045.2">
                    <element name="GCF_000146045.2_R64" ftype="fasta.gz">
                        <assert_contents>
                            <has_size value="3843460" delta="2000"/>
                        </assert_contents>
                    </element>
                </element>
            </output_collection>
            <output_collection name="protein_fasta" type="list" count="1">
                <element name="GCF_000146045.2" ftype="fasta.gz">
                    <assert_contents>
                        <has_size value="1847862" delta="2000"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000146045.2" ftype="fasta.gz">
                    <assert_contents>
                        <has_size min="2700000" max="2800000"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test expect_num_outputs="3">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="seq-report,genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="sequence_report" type="list" count="2"/>
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression="&gt;NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/>
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/>
            </output_collection>
        </test>
        <!-- tax_exact_match should filter out strains
             https://github.com/ncbi/datasets/issues/187 -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="4932"/>
                <param name="tax_exact_match" value="true"/>
            </conditional>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
                </assert_contents>
            </output>
        </test>
        <!-- test search filter -->
        <test expect_num_outputs="1">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="Streptococcus"/>
            </conditional>
            <section name="filters">
                <repeat name="search">
                    <param name="search" value="pyogenes"/>
                </repeat>
            </section>
            <section name="file_choices">
                <param name="include" value_json="null"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="pyogenes"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

Downloads genome assemblies from NCBI using the `datasets`_ command-line tool.
Retrieve genome sequences, annotations, and metadata by accession or taxon.

**Query Options**

- **By Accession**: NCBI Assembly (GCF\_/GCA\_) or BioProject accession
- **By Taxon**: Taxonomy ID, scientific name, or common name

**Filters**

====================  ===============================================
Filter                Description
====================  ===============================================
Reference only        Limit to reference/representative assemblies
Annotated only        Include only genomes with annotations
Assembly level        Chromosome, complete, contig, or scaffold
Assembly source       RefSeq (GCF\_) or GenBank (GCA\_)
Exclude atypical      Remove atypical assemblies (e.g., partial)
MAG filter            Include/exclude metagenome-assembled genomes
Date range            Filter by release date
====================  ===============================================

----

.. class:: warningmark

**Note**: The "Reference only" filter returns only RefSeq (GCF\_) assemblies.
If a taxon has only GenBank (GCA\_) assemblies, this filter will return no results
with a misleading error message. It is a NCBI datasets bug (not a Galaxy bug).

**Outputs**

- **Data Report**: Tabular metadata for matching assemblies
- **Genome FASTA**: Genomic sequences (nested collection by accession)
- **Annotation files**: GFF3, GTF, GenBank flat files
- **Protein/RNA/CDS**: Amino acid and nucleotide sequences
- **Sequence Report**: Per-sequence metadata (chromosome, length, etc.)

.. _datasets: https://www.ncbi.nlm.nih.gov/datasets/

]]></help>
    <expand macro="citations"/>
</tool>