ncbi_datasets: datasets_genome.xml comparison

comparison datasets_genome.xml @ 14:a222b4d3d52e draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit d3fa7b70aa028f527a1dbbb210c172c637dfd4d9

author	iuc
date	Fri, 09 Dec 2022 15:11:04 +0000
parents	d979ba07ddd4
children	dfad868c911b

comparison

equal deleted inserted replaced

-:d979ba07ddd4
+:a222b4d3d52e
 <tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
 <description>download genome sequence, annotation and metadata</description>
 <macros>
 <import>macros.xml</import>
 </macros>
+<expand macro="bio_tools"/>
 <expand macro="requirements"></expand>
+<expand macro="version_command"/>
 <command><![CDATA[
+#import re
 @SETUP_CERTIFICATES@
 datasets download genome $query.subcommand.download_by
 #if $query.subcommand.download_by == 'accession':
 #if $query.subcommand.text_or_file.text_or_file == 'text':
-#echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x)
+#echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
 #else
 --inputfile '$query.subcommand.text_or_file.inputfile'
 #end if
 #else:
-'$query.subcommand.taxon'
+'$query.subcommand.taxon_positional'
 $query.subcommand.tax_exact_match
 #end if
 $filters.reference
 $filters.annotated
 #if $filters.assembly_level:
 @RELEASED_AFTER@
 #for search_term in $filters.search:
 --search '$filters.search_term'
 #end for
 --no-progressbar
-#if $uncompressed
+--dehydrated
-&& 7z x -y ncbi_dataset.zip
-#else
+## produce TSV report file
-&& 7z l ncbi_dataset.zip > ncbi_dataset.txt
+&& dataformat tsv genome
+--package ncbi_dataset.zip
+--fields #echo ",".join($file_choices.report_columns)
+> genome_data_report.tsv
+## unzip and rehydrate if any data is to be downloaded (include is not None)
+#if $file_choices.include
+## unzip
+&& 7z x -y ncbi_dataset.zip > 7z.log
+## rehydrate
+&& datasets rehydrate
+--directory ./
+#if not $file_choices.decompress
+--gzip
+#end if
+--max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}
+## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
+&& find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;
+## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
+## note "not decompress" means that the datasets are provided compressed (datasets rehydrate is called with --gzip)
+##      in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
+&& find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
+#if not $file_choices.decompress
+&& find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
+#end if
+#if "seq-report" in $file_choices.include
+&& find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
+#end if
+&& true  ## because Galaxy removes trailing ; from command
 #end if
 ]]></command>
 <inputs>
 <section name="query" title="Query" expanded="true">
 <conditional name="subcommand">
 <param name="download_by" type="select" label="Choose how to find genomes to download">
-<option value="accession">Download by NCBI assembly or BioProject accession</option>
+<option value="accession">By NCBI assembly or BioProject accession</option>
-<option value="taxon">Download by taxon</option>
+<option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
 </param>
 <when value="accession">
 <expand macro="text_or_file"/>
 </when>
 <when value="taxon">
-<param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/>
+<expand macro="taxon_positional"/>
 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
 </when>
 </conditional>
 </section>
 <section name="filters" title="Filters and Limit">
 <expand macro="assembly_level"/>
 <param argument="--assembly-version" type="select" label="Assembly version(s)">
 <option value="latest">Latest</option>
 <option value="all">All</option>
 </param>
-<!-- TODO add test for assembly source: according to CLI doc args are RefSeq, GenBank, All and not refseq / genbank-->
 <expand macro="assembly_source"/>
 <expand macro="chromosomes"/>
 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
 <expand macro="released_options"/>
 <expand macro="released_options" before_or_after="after"/>
 <repeat name="search" title="Add search terms">
 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
 </repeat>
 </section>
-<section name="file_choices" title="File Choices" expanded="true">
+<section name="file_choices" title="Output options" expanded="true">
-<expand macro="include"/>
+<expand macro="tsv_report_columns">
+<option value="accession" selected="true">accession</option>
+<option value="organism-name" selected="true">organism-name</option>
+<option value="assminfo-submitter" selected="true">assminfo-submitter</option>
+<option value="assminfo-name" selected="true">assminfo-name</option>
+</expand>
+<expand macro="include">
+<expand macro="genome_includes"/>
+</expand>
+<param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
 </section>
-<param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/>
 </inputs>
 <outputs>
-<data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip">
+<data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
-<filter>not uncompressed</filter>
-</data>
-<data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt">
-<filter>not uncompressed</filter>
-</data>
-<data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl">
-<filter>uncompressed</filter>
-</data>
 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
-<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-<filter>uncompressed and file_choices['include'] and "seq-report" in file_choices['include']</filter>
+<filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
 </collection>
 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
-<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)"  directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-<filter>uncompressed and file_choices['include'] and "genome" in file_choices['include']</filter>
+<filter>file_choices['include'] and "genome" in file_choices['include']</filter>
 </collection>
 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
-<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-<filter>uncompressed and file_choices['include'] and "rna" in file_choices['include']</filter>
+<filter>file_choices['include'] and "rna" in file_choices['include']</filter>
 </collection>
 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
-<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-<filter>uncompressed and file_choices['include'] and "protein" in file_choices['include']</filter>
+<filter>file_choices['include'] and "protein" in file_choices['include']</filter>
 </collection>
 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
-<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+<discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-<filter>uncompressed and file_choices['include'] and "cds" in file_choices['include']</filter>
+<filter>file_choices['include'] and "cds" in file_choices['include']</filter>
 </collection>
 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-<filter>uncompressed and file_choices['include'] and "gff3" in file_choices['include']</filter>
+<filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
 </collection>
 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-<filter>uncompressed and file_choices['include'] and "gtf" in file_choices['include']</filter>
+<filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
 </collection>
 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-<filter>uncompressed and file_choices['include'] and "gbff" in file_choices['include']</filter>
+<filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
 </collection>
 </outputs>
 <tests>
+<!-- Note: All but one test use the non-default decompress="true"
+this is because (at 11/22) Galaxy can not apply text assertions on the content
+of compressed files https://github.com/galaxyproject/galaxy/pull/15085
+So with decompress="true" more powerful assertions are possible.
+A single test checks the default, ie decompress="false".
+-->
+<test expect_num_outputs="3">
+<conditional name="query|subcommand">
+<param name="download_by" value="taxon"/>
+<param name="taxon_positional" value="human"/>
+</conditional>
+<param name="chromosomes" value="21"/>
+<param name="released_before" value="01/01/2018"/>
+<section name="file_choices">
+<!-- include a sequence (which should be downloaded as fasta.gz)
+and one non-sequence (which should be decompressed) output -->
+<param name="include" value="rna,gff3"/>
+</section>
+<output name="genome_data_report">
+<assert_contents>
+<has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/>
+<has_n_lines n="144"/>
+<has_n_columns n="4"/>
+</assert_contents>
+</output>
+<output_collection name="rna_fasta" type="list" count="1">
+<element name="GCF_000306695.2" decompress="true">
+<assert_contents>
+<has_text text=">"/>
+</assert_contents>
+</element>
+</output_collection>
+<output_collection name="genomic_gff" type="list">
+<element name="GCF_000306695.2">
+<assert_contents>
+<has_n_lines min="1000000"/>
+<has_line line="##gff-version 3"/>
+<!-- TODO this will only work when the galaxy python packages for 22.05 have been released
+<has_n_columns n="9" comment="#"/> -->
+</assert_contents>
+</element>
+</output_collection>
+<assert_command>
+<has_text text="gunzip"/>
+</assert_command>
+</test>
 <test expect_num_outputs="2">
 <conditional name="query|subcommand">
 <param name="download_by" value="taxon"/>
-<param name="text_or_file" value="text"/>
+<param name="taxon_positional" value="human"/>
-<param name="taxon" value="human"/>
 </conditional>
 <param name="chromosomes" value="21"/>
-<param name="include" value=""/>
-<param name="uncompressed" value="false"/>
-<param name="released_before" value="01/01/2018"/>
-<output name="archive_contents">
-<assert_contents>
-<has_text text="ncbi_dataset/data/dataset_catalog.json"/>
-</assert_contents>
-</output>
-</test>
-<test expect_num_outputs="2">
-<conditional name="query|subcommand">
-<param name="download_by" value="taxon"/>
-<param name="text_or_file" value="text"/>
-<param name="taxon" value="human"/>
-</conditional>
-<param name="chromosomes" value="21"/>
-<param name="include" value="genome"/>
-<param name="uncompressed" value="true"/>
 <param name="assembly_level" value="chromosome,complete"/>
 <param name="released_before" value="01/01/2018"/>
+<section name="file_choices">
+<param name="include" value="genome"/>
+<param name="decompress" value="true"/>
+</section>
 <output_collection name="genome_fasta" type="list:list" count="14">
 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/>
 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
 </output_collection>
 <output name="genome_data_report">
 <assert_contents>
 <has_text text="Homo sapiens"/>
-</assert_contents>
+<has_n_columns n="4"/>
-</output>
+</assert_contents>
-</test>
+</output>
-<!-- same as precious test but assembly_source (refseq which removes some of the genomes) -->
+</test>
+<!-- same as previous test but assembly_source (refseq which removes some of the genomes) -->
 <test expect_num_outputs="2">
 <conditional name="query|subcommand">
 <param name="download_by" value="taxon"/>
-<param name="text_or_file" value="text"/>
+<param name="taxon_positional" value="human"/>
-<param name="taxon" value="human"/>
 </conditional>
 <param name="chromosomes" value="21"/>
-<param name="include" value="genome"/>
-<param name="uncompressed" value="true"/>
 <param name="assembly_level" value="chromosome,complete"/>
 <param name="assembly_source" value="refseq"/>
 <param name="released_before" value="01/01/2018"/>
+<section name="file_choices">
+<param name="include" value="genome"/>
+<param name="decompress" value="true"/>
+</section>
 <output_collection name="genome_fasta" type="list:list" count="2">
 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
 </output_collection>
 <output name="genome_data_report">
 <assert_contents>
 <has_text text="Homo sapiens"/>
+<has_n_lines n="5"/>
+<has_n_columns n="4"/>
 </assert_contents>
 </output>
 </test>
 <test expect_num_outputs="4">
 <conditional name="query|subcommand">
 <conditional name="text_or_file">
 <param name="text_or_file" value="text"/>
 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
 </conditional>
 </conditional>
-<param name="include" value="seq-report,gtf,cds"/>
-<param name="uncompressed" value="true"/>
 <param name="released_before" value="01/01/2007"/>
+<section name="file_choices">
+<param name="include" value="seq-report,gtf,cds"/>
+<param name="decompress" value="true"/>
+</section>
 <output name="genome_data_report">
 <assert_contents>
 <has_text text="GCF_000013305.1"/>
-</assert_contents>
+<has_n_lines n="3"/>
-</output>
+<has_n_columns n="4"/>
+</assert_contents>
+</output>
+<output_collection name="sequence_report" type="list" count="2" >
+<element name="GCF_000007445.1">
+<assert_contents>
+<has_text text="GCF_000007445.1"/>
+<has_n_lines n="2"/>
+<has_n_columns n="14"/>
+</assert_contents>
+</element>
+<element name="GCF_000013305.1">
+<assert_contents>
+<has_text text="GCF_000013305.1"/>
+<has_n_lines n="2"/>
+<has_n_columns n="14"/>
+</assert_contents>
+</element>
+</output_collection>
 <output_collection name="genomic_gtf" type="list">
 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
 </output_collection>
 <output_collection name="genomic_cds" type="list">
-<element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
+<element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains" decompress="true"/>
-<element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
+<element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains" decompress="true"/>
 </output_collection>
 </test>
 <test expect_num_outputs="4">
 <conditional name="query|subcommand">
 <param name="download_by" value="accession"/>
 <conditional name="text_or_file">
 <param name="text_or_file" value="file"/>
 <param name="inputfile" value="accessions.txt"/>
 </conditional>
 </conditional>
-<param name="include" value="seq-report,gbff,gff3"/>
-<param name="uncompressed" value="true"/>
 <param name="released_before" value="01/01/2007"/>
-<output name="genome_data_report">
+<section name="file_choices">
-<assert_contents>
+<param name="include" value="seq-report,gff3,gbff"/>
-<has_text text="SAMN02604181"/>
+<param name="decompress" value="true"/>
+</section>
+<output name="genome_data_report">
+<assert_contents>
+<has_text text="GCF_000013305.1"/>
+<has_text text="GCF_000007445.1"/>
+<has_n_lines n="3"/>
+<has_n_columns n="4"/>
 </assert_contents>
 </output>
 <output_collection name="genomic_gff" type="list">
 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
 <output_collection name="genomic_gbff" type="list">
 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
 </output_collection>
 </test>
-<test expect_num_outputs="2">
+<!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
+<test expect_num_outputs="2" expect_failure="true">
 <conditional name="query|subcommand">
 <param name="download_by" value="accession"/>
 <conditional name="text_or_file">
 <param name="text_or_file" value="text"/>
 <param name="accession" value="GCF_000001405"/>
 </conditional>
 </conditional>
-<param name="include" value="seq-report"/>
-<param name="uncompressed" value="true"/>
 <param name="released_before" value="01/01/2015"/>
 <param name="assembly_version" value="all"/>
-<output_collection name="sequence_report" count="4">
+<section name="file_choices">
-<element name="GCF_000001405.25">
+<param name="include" value="seq-report"/>
-<assert_contents>
+</section>
-<has_text text="assignedMoleculeLocationType"/>
+<!--
-</assert_contents>
+<output_collection name="sequence_report" type="list" count="4" >
-</element>
+-->
-<element name="GCF_000001405.26">
-<assert_contents>
-<has_text text="assignedMoleculeLocationType"/>
-</assert_contents>
-</element>
-<element name="GCF_000001405.27">
-<assert_contents>
-<has_text text="assignedMoleculeLocationType"/>
-</assert_contents>
-</element>
-<element name="GCF_000001405.28">
-<assert_contents>
-<has_text text="assignedMoleculeLocationType"/>
-</assert_contents>
-</element>
-</output_collection>
 </test>
 <test expect_num_outputs="5">
 <conditional name="query|subcommand">
 <param name="download_by" value="accession"/>
 <conditional name="text_or_file">
 <param name="text_or_file" value="text"/>
 <param name="accession" value="GCF_000146045.2"/>
 </conditional>
 </conditional>
-<param name="include" value="seq-report,genome,rna,cds"/>
+<section name="file_choices">
-<param name="uncompressed" value="true"/>
+<param name="include" value="genome,protein,rna,cds"/>
+<param name="decompress" value="true"/>
+</section>
 <output_collection name="genome_fasta" type="list:list" count="1">
 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
 </output_collection>
+<output_collection name="protein_fasta" type="list" count="1">
+<element name="GCF_000146045.2" decompress="true">
+<assert_contents>
+<has_text text=">"/>
+</assert_contents>
+</element>
+</output_collection>
+<output_collection name="rna_fasta" type="list" count="1">
+<element name="GCF_000146045.2" decompress="true">
+<assert_contents>
+<has_text text=">"/>
+</assert_contents>
+</element>
+</output_collection>
+</test>
+<!-- same as the previous test, but use the default value for decompress,
+see comment at the beginning of the tests -->
+<test expect_num_outputs="5">
+<conditional name="query|subcommand">
+<param name="download_by" value="accession"/>
+<conditional name="text_or_file">
+<param name="text_or_file" value="text"/>
+<param name="accession" value="GCF_000146045.2"/>
+</conditional>
+</conditional>
+<section name="file_choices">
+<param name="include" value="genome,protein,rna,cds"/>
+</section>
+<output_collection name="genome_fasta" type="list:list" count="1">
+<element name="GCF_000146045.2">
+<element name="GCF_000146045.2_R64" ftype="fasta.gz">
+<assert_contents>
+<has_size value="3843460"/>
+</assert_contents>
+</element>
+</element>
+</output_collection>
+<output_collection name="protein_fasta" type="list" count="1">
+<element name="GCF_000146045.2" ftype="fasta.gz">
+<assert_contents>
+<has_size value="1844838"/>
+</assert_contents>
+</element>
+</output_collection>
+<output_collection name="rna_fasta" type="list" count="1">
+<element name="GCF_000146045.2" ftype="fasta.gz">
+<assert_contents>
+<has_size value="2784534"/>
+</assert_contents>
+</element>
+</output_collection>
 </test>
 <test expect_num_outputs="3">
 <conditional name="query|subcommand">
 <param name="download_by" value="accession"/>
 <conditional name="text_or_file">
 <param name="text_or_file" value="text"/>
 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
 </conditional>
 </conditional>
-<param name="include" value="seq-report,genome"/>
+<section name="file_choices">
-<param name="uncompressed" value="true"/>
+<param name="include" value="seq-report,genome"/>
+<param name="decompress" value="true"/>
+</section>
+<output_collection name="sequence_report" type="list" count="2"/>
 <output_collection name="genome_fasta" type="list:list" count="2">
 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>
 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
 </output_collection>
 </test>
 https://github.com/ncbi/datasets/issues/187
 hence we set  expect_test_failure="true"-->
 <test expect_num_outputs="1" expect_test_failure="true">
 <conditional name="query|subcommand">
 <param name="download_by" value="taxon"/>
-<param name="text_or_file" value="text"/>
+<param name="taxon_positional" value="4932"/>
-<param name="taxon" value="4932"/>
 <param name="tax_exact_match" value="true"/>
 </conditional>
 <param name="include" value=""/>
-<param name="uncompressed" value="true"/>
 <output name="genome_data_report">
 <assert_contents>
 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
 </assert_contents>
 </output>
 <help>
 <![CDATA[
 **Download Genome Datasets from NCBI**
 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
-Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.
+Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.
-Tthe default genome dataset includes the following files (if available):
+The download is a three step process:
-* data_report.jsonl (genome assembly and annotation metadata, not always available)
-* genomic.fna (genomic sequences)
+1. A "dehydrated" zip file is downloaded (which includes the metadata and the download URLs)
-* rna.fna (transcript sequences)
+2. The metadata is transformed into a tabular (TSV) file
-* protein.faa (protein sequences)
+3. The data is hydrated (the actual data is downloaded)
-* genomic.gff (genome annotation in gff3 format)
-* dataset_catalog.json (a list of files and file types included in the dataset)
+The 3rd step can be skipped by unselecting all output types in the `Include` parameter.
+This makes it possible to inspect the metadata prior to the actual data download. It also
+allows the tool to be used for querying data sets (and their accessions) of interest, which
+can then be downloaded in a second call using the accessions.
 ]]>
 </help>
+<expand macro="citations"/>
 </tool>

Mercurial > repos > iuc > ncbi_datasets

comparison datasets_genome.xml @ 14:a222b4d3d52e draft