Mercurial > repos > iuc > ncbi_datasets
diff datasets_genome.xml @ 3:c87df3f9e19d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 800d16f3bd40266d8734f4572988cb2b306b4fd3"
| author | iuc |
|---|---|
| date | Thu, 27 Jan 2022 08:20:15 +0000 |
| parents | 2753a5786114 |
| children | d64df2210624 |
line wrap: on
line diff
--- a/datasets_genome.xml Thu Jul 15 15:45:43 2021 +0000 +++ b/datasets_genome.xml Thu Jan 27 08:20:15 2022 +0000 @@ -1,131 +1,170 @@ -<tool id="datasets_download_genome" name="NCBI datasets download genome" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> - <description>Download assembled genomes from NCBI</description> +<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> + <description>download genome sequence, annotation and metadata</description> <macros> <import>macros.xml</import> </macros> <expand macro="requirements"></expand> <command><![CDATA[ @SETUP_CERTIFICATES@ -datasets download genome $subcommand.download_by -#if $subcommand.download_by == 'accession': - #if $subcommand.text_or_file.text_or_file == 'text': - #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x) +datasets download genome $query.subcommand.download_by +#if $query.subcommand.download_by == 'accession': + #if $query.subcommand.text_or_file.text_or_file == 'text': + #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x) #else - --inputfile '$subcommand.text_or_file.inputfile' + --inputfile '$query.subcommand.text_or_file.inputfile' #end if #else: - '$subcommand.taxon' + '$query.subcommand.taxon' +#end if +$filters.reference +$filters.annotated +#if $filters.assembly_level: +--assembly_level $filters.assembly_level #end if -$annotated -$dehydrated -#if $assembly_level: ---assembly_level $assembly_level +#if $filters.assembly_source: +--assembly_source $filters.assembly_source #end if -#if $assembly_source: ---assembly_source $assembly_source +#if $filters.chromosomes: +--chromosomes '$filters.chromosomes' #end if ---chromosomes '$chromosomes' @EXCLUDES_GENOME@ @INCLUDES_GENOME@ -$reference @RELEASED_BEFORE@ @RELEASED_SINCE@ -#for search_term in $search: - --search '$search_term' +#for search_term in $filters.search: + --search 
'$filters.search_term' #end for -#if not $dehydrated: - && 7z x ncbi_dataset.zip +#if $uncompressed +&& unzip ncbi_dataset.zip +#else +&& unzip -l ncbi_dataset.zip > ncbi_dataset.txt #end if ]]></command> <inputs> - <conditional name="subcommand"> - <param name="download_by" type="select" label="Choose how to find genomes to download"> - <option value="accession">Download by NCBI assembly or BioProject accession</option> - <option value="taxon">Download by taxon</option> - </param> - <when value="accession"> - <expand macro="text_or_file"/> - </when> - <when value="taxon"> - <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurs, etc."></param> - </when> - </conditional> - <expand macro="annotation"></expand> - <expand macro="dehydrated"></expand> - <expand macro="assembly_level"></expand> - <expand macro="assembly_source"></expand> - <expand macro="chromosomes"></expand> - <expand macro="excludes_genome"></expand> - <expand macro="includes_genome"></expand> - <expand macro="released_options"></expand> - <expand macro="released_options" before_or_after="since"></expand> - <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/> - <repeat name="search" title="Add search terms"> - <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> - </repeat> + <section name="query" title="Query" expanded="true"> + <conditional name="subcommand"> + <param name="download_by" type="select" label="Choose how to find genomes to download"> + <option value="accession">Download by NCBI assembly or BioProject accession</option> + <option value="taxon">Download by taxon</option> + </param> + <when value="accession"> + <expand macro="text_or_file"/> + </when> + <when value="taxon"> + <param name="taxon" 
type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."></param> + </when> + </conditional> + </section> + <section name="filters" title="Filters and Limit"> + <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/> + <expand macro="annotation"></expand> + <expand macro="assembly_level"></expand> + <expand macro="assembly_source"></expand> + <expand macro="chromosomes"></expand> + <expand macro="released_options"></expand> + <expand macro="released_options" before_or_after="since"></expand> + + <repeat name="search" title="Add search terms"> + <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> + </repeat> + </section> + <section name="file_choices" title="File Choices"> + <expand macro="excludes_genome"></expand> + <expand macro="includes_genome"></expand> + </section> + <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/> </inputs> <outputs> - <data name="dehydrated_archive" format="zip" label="Dehydrated Archive" from_work_dir="ncbi_dataset.zip"> - <filter>dehydrated</filter> + <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip"> + <filter>not uncompressed</filter> + </data> + <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt"> + <filter>not uncompressed</filter> </data> - <collection name="genome_fasta" label="NCBI genome datasets: genome fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/.*_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and not exclude_seq</filter> + <data name="genome_data_report" format="json" label="NCBI 
Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl"> + <filter>uncompressed</filter> + </data> + <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed</filter> </collection> - <collection name="protein_fasta" label="NCBI genome datasets: protein fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and not exclude_protein</filter> + <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/.*(?<!cds_from)(chr|unplaced|_genomic)*fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['exclude_seq']</filter> </collection> - <collection name="genomic_gff" label="NCBI genome datasets: genomic gff" type="list"> + <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['exclude_genomic_cds']</filter> + </collection> + <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and not exclude_gff3</filter> + <filter>uncompressed and file_choices['exclude_gff3']</filter> + </collection> + 
<collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['exclude_rna']</filter> </collection> - <collection name="genomic_gtf" label="NCBI genome datasets: genomic gtf" type="list"> + <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['exclude_protein']</filter> + </collection> + <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['include_gbff']</filter> + </collection> + <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and include_gtf</filter> - </collection> - <collection name="genomic_gbff" label="NCBI genome datasets: genomic gbff" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="genbank" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and include_gbff</filter> + <filter>uncompressed and file_choices['include_gtf']</filter> </collection> </outputs> <tests> - <test title="test dehydrated download by taxon"> - <conditional name="subcommand"> + <test expect_num_outputs="2"> + <conditional name="query|subcommand"> 
<param name="download_by" value="taxon"></param> <param name="text_or_file" value="text"></param> <param name="taxon" value="human"></param> </conditional> <param name="chromosomes" value="21"></param> - <param name="dehydrated" value="true"/> + <param name="uncompressed" value="false"/> <param name="released_before" value="01/01/2018"></param> - <output name="dehydrated_archive" value="human_chrom_21_dehydrated.zip" compare="sim_size" delta="10000"/> + <output name="archive_contents"> + <assert_contents> + <has_text text="ncbi_dataset/data/dataset_catalog.json"/> + </assert_contents> + </output> </test> - <test title="test download by comma-separated accession"> - <conditional name="subcommand"> + <test expect_num_outputs="5"> + <conditional name="query|subcommand"> <param name="download_by" value="accession"></param> <conditional name="text_or_file"> <param name="text_or_file" value="text"></param> <param name="accession" value="GCF_000013305.1 GCF_000007445.1"></param> </conditional> </conditional> - <param name="dehydrated" value="false"/> + <param name="uncompressed" value="true"/> <param name="released_before" value="01/01/2007"></param> - <output_collection name="genome_fasta" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/> + <param name="exclude_genomic_cds" value="true"/> + <param name="include_gtf" value="true"/> + <output name="genome_data_report"> + <assert_contents> + <has_text text="GCF_000013305.1"/> + </assert_contents> + </output> + <output_collection name="sequence_report" type="list"> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/> + <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/> </output_collection> - <output_collection name="protein_fasta" type="list"> - <element name="GCF_000013305.1" 
file="GCF_000013305.1.protein.fa" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/> + <output_collection name="genomic_gtf" type="list"> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> + <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> </output_collection> - <output_collection name="genomic_gff" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/> + <output_collection name="genomic_cds" type="list"> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/> + <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> </output_collection> </test> - <test title="test download by accessions listed in file"> - <conditional name="subcommand"> + <test expect_num_outputs="4"> + <conditional name="query|subcommand"> <param name="download_by" value="accession"></param> <conditional name="text_or_file"> <param name="text_or_file" value="file"></param> @@ -133,43 +172,44 @@ </conditional> </conditional> <param name="include_gbff" value="true"/> - <param name="include_gtf" value="true"/> - <param name="dehydrated" value="false"/> - <param name="released_before" value="01/01/2007"></param> - <output_collection name="genome_fasta" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/> - </output_collection> - <output_collection name="protein_fasta" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/> + <param name="exclude_seq" value="false"/> + 
<param name="exclude_gff3" value="true"/> + <param name="uncompressed" value="true"/> + <param name="released_before" value="01/02/2007"></param> + <output name="genome_data_report"> + <assert_contents> + <has_text text="SAMN02604181"/> + </assert_contents> + </output> + <output_collection name="sequence_report" type="list"> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/> + <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/> </output_collection> <output_collection name="genomic_gff" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/> - </output_collection> - <output_collection name="genomic_gtf" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gtf" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> + <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> + <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> </output_collection> <output_collection name="genomic_gbff" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gbff" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gbff" compare="contains"/> + <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> + <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> </output_collection> </test> </tests> <help> +<![CDATA[ +**Download Genome Datasets from NCBI** Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. 
Datasets are downloaded as a zip file. -The default genome dataset includes the following files (if available): -* genomic.fna (genomic sequences) -* rna.fna (transcript sequences) -* protein.faa (protein sequences) -* genomic.gff (genome annotation in gff3 format) -* data_report.jsonl (data report with genome assembly and annotation metadata) -* dataset_catalog.json (a list of files and file types included in the dataset) +The default genome dataset includes the following files (if available): + * genomic.fna (genomic sequences) + * rna.fna (transcript sequences) + * protein.faa (protein sequences) + * genomic.gff (genome annotation in gff3 format) + * data_report.jsonl (data report with genome assembly and annotation metadata) + * dataset_catalog.json (a list of files and file types included in the dataset) +]]> </help> </tool>
