Mercurial > repos > iuc > ncbi_datasets

diff datasets_genome.xml @ 3:c87df3f9e19d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 800d16f3bd40266d8734f4572988cb2b306b4fd3"
author: iuc
date: Thu, 27 Jan 2022 08:20:15 +0000
parents: 2753a5786114
children: d64df2210624
--- a/datasets_genome.xml	Thu Jul 15 15:45:43 2021 +0000
+++ b/datasets_genome.xml	Thu Jan 27 08:20:15 2022 +0000
@@ -1,131 +1,170 @@
-<tool id="datasets_download_genome" name="NCBI datasets download genome" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
-    <description>Download assembled genomes from NCBI</description>
+<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
+    <description>download genome sequence, annotation and metadata</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements"></expand>
     <command><![CDATA[
 @SETUP_CERTIFICATES@
-datasets download genome $subcommand.download_by
-#if $subcommand.download_by == 'accession':
-    #if $subcommand.text_or_file.text_or_file == 'text':
-        #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x)
+datasets download genome $query.subcommand.download_by
+#if $query.subcommand.download_by == 'accession':
+    #if $query.subcommand.text_or_file.text_or_file == 'text':
+        #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x)
     #else
-        --inputfile '$subcommand.text_or_file.inputfile'
+        --inputfile '$query.subcommand.text_or_file.inputfile'
     #end if
 #else:
-    '$subcommand.taxon'
+    '$query.subcommand.taxon'
+#end if
+$filters.reference
+$filters.annotated
+#if $filters.assembly_level:
+--assembly_level $filters.assembly_level
 #end if
-$annotated
-$dehydrated
-#if $assembly_level:
---assembly_level $assembly_level
+#if $filters.assembly_source:
+--assembly_source $filters.assembly_source
 #end if
-#if $assembly_source:
---assembly_source $assembly_source
+#if $filters.chromosomes:
+--chromosomes '$filters.chromosomes'
 #end if
---chromosomes '$chromosomes'
 @EXCLUDES_GENOME@
 @INCLUDES_GENOME@
-$reference
 @RELEASED_BEFORE@
 @RELEASED_SINCE@
-#for search_term in $search:
-    --search '$search_term'
+#for search_term in $filters.search:
+    --search '$search_term.search'
 #end for
-#if not $dehydrated:
-    && 7z x ncbi_dataset.zip
+#if $uncompressed
+&& unzip ncbi_dataset.zip
+#else
+&& unzip -l ncbi_dataset.zip > ncbi_dataset.txt
 #end if
 ]]></command>
     <inputs>
-        <conditional name="subcommand">
-            <param name="download_by" type="select" label="Choose how to find genomes to download">
-                <option value="accession">Download by NCBI assembly or BioProject accession</option>
-                <option value="taxon">Download by taxon</option>
-            </param>
-            <when value="accession">
-                <expand macro="text_or_file"/>
-            </when>
-            <when value="taxon">
-                <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurs, etc."></param>
-            </when>
-        </conditional>
-        <expand macro="annotation"></expand>
-        <expand macro="dehydrated"></expand>
-        <expand macro="assembly_level"></expand>
-        <expand macro="assembly_source"></expand>
-        <expand macro="chromosomes"></expand>
-        <expand macro="excludes_genome"></expand>
-        <expand macro="includes_genome"></expand>
-        <expand macro="released_options"></expand>
-        <expand macro="released_options" before_or_after="since"></expand>
-        <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
-        <repeat name="search" title="Add search terms">
-            <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
-        </repeat>
+        <section name="query" title="Query" expanded="true">
+            <conditional name="subcommand">
+                <param name="download_by" type="select" label="Choose how to find genomes to download">
+                    <option value="accession">Download by NCBI assembly or BioProject accession</option>
+                    <option value="taxon">Download by taxon</option>
+                </param>
+                <when value="accession">
+                    <expand macro="text_or_file"/>
+                </when>
+                <when value="taxon">
+                    <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."></param>
+                </when>
+            </conditional>
+        </section>
+        <section name="filters" title="Filters and Limit">
+            <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>            
+            <expand macro="annotation"></expand>
+            <expand macro="assembly_level"></expand>
+            <expand macro="assembly_source"></expand>
+            <expand macro="chromosomes"></expand>
+            <expand macro="released_options"></expand>
+            <expand macro="released_options" before_or_after="since"></expand>
+
+            <repeat name="search" title="Add search terms">
+                <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
+            </repeat>
+        </section>
+        <section name="file_choices" title="File Choices">
+            <expand macro="excludes_genome"></expand>
+            <expand macro="includes_genome"></expand>
+        </section>
+        <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/>
     </inputs>
     <outputs>
-        <data name="dehydrated_archive" format="zip" label="Dehydrated Archive" from_work_dir="ncbi_dataset.zip">
-            <filter>dehydrated</filter>
+        <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip">
+            <filter>not uncompressed</filter>
+        </data>
+        <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt">
+            <filter>not uncompressed</filter>
         </data>
-        <collection name="genome_fasta" label="NCBI genome datasets: genome fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/.*_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and not exclude_seq</filter>
+        <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl">
+            <filter>uncompressed</filter>
+        </data>
+        <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed</filter>
         </collection>
-        <collection name="protein_fasta" label="NCBI genome datasets: protein fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and not exclude_protein</filter>
+        <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/.*(?&lt;!cds_from)(chr.*|unplaced.*|_genomic)\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['exclude_seq']</filter>
         </collection>
-        <collection name="genomic_gff" label="NCBI genome datasets: genomic gff" type="list">
+        <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['exclude_genomic_cds']</filter>
+        </collection>
+        <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
             <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and not exclude_gff3</filter>
+            <filter>uncompressed and file_choices['exclude_gff3']</filter>
+        </collection>
+        <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['exclude_rna']</filter>
         </collection>
-        <collection name="genomic_gtf" label="NCBI genome datasets: genomic gtf" type="list">
+        <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['exclude_protein']</filter>
+        </collection>
+        <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['include_gbff']</filter>
+        </collection>
+        <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
             <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and include_gtf</filter>
-        </collection>
-        <collection name="genomic_gbff" label="NCBI genome datasets: genomic gbff" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="genbank" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and include_gbff</filter>
+            <filter>uncompressed and file_choices['include_gtf']</filter>
         </collection>
     </outputs>
     <tests>
-        <test title="test dehydrated download by taxon">
-            <conditional name="subcommand">
+        <test expect_num_outputs="2">
+            <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"></param>
                 <param name="text_or_file" value="text"></param>
                 <param name="taxon" value="human"></param>
             </conditional>
             <param name="chromosomes" value="21"></param>
-            <param name="dehydrated" value="true"/>
+            <param name="uncompressed" value="false"/>
             <param name="released_before" value="01/01/2018"></param>
-            <output name="dehydrated_archive" value="human_chrom_21_dehydrated.zip" compare="sim_size" delta="10000"/>
+            <output name="archive_contents">
+                <assert_contents>
+                    <has_text text="ncbi_dataset/data/dataset_catalog.json"/>
+                </assert_contents>
+            </output>
         </test>
-        <test title="test download by comma-separated accession">
-            <conditional name="subcommand">
+        <test expect_num_outputs="5">
+            <conditional name="query|subcommand">
                 <param name="download_by" value="accession"></param>
                 <conditional name="text_or_file">
                     <param name="text_or_file" value="text"></param>
                     <param name="accession" value="GCF_000013305.1 GCF_000007445.1"></param>
                 </conditional>
             </conditional>
-            <param name="dehydrated" value="false"/>
+            <param name="uncompressed" value="true"/>
             <param name="released_before" value="01/01/2007"></param>
-            <output_collection name="genome_fasta" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/>
+            <param name="exclude_genomic_cds" value="true"/>
+            <param name="include_gtf" value="true"/>
+            <output name="genome_data_report">
+                <assert_contents>
+                    <has_text text="GCF_000013305.1"/>
+                </assert_contents>
+            </output>
+            <output_collection name="sequence_report" type="list">
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/>
             </output_collection>
-            <output_collection name="protein_fasta" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/>
+            <output_collection name="genomic_gtf" type="list">
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
+                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
             </output_collection>
-            <output_collection name="genomic_gff" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/>
+            <output_collection name="genomic_cds" type="list">
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
             </output_collection>
         </test>
-        <test title="test download by accessions listed in file">
-            <conditional name="subcommand">
+        <test expect_num_outputs="4">
+            <conditional name="query|subcommand">
                 <param name="download_by" value="accession"></param>
                 <conditional name="text_or_file">
                     <param name="text_or_file" value="file"></param>
@@ -133,43 +172,44 @@
                 </conditional>
             </conditional>
             <param name="include_gbff" value="true"/>
-            <param name="include_gtf" value="true"/>
-            <param name="dehydrated" value="false"/>
-            <param name="released_before" value="01/01/2007"></param>
-            <output_collection name="genome_fasta" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/>
-            </output_collection>
-            <output_collection name="protein_fasta" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/>
+            <param name="exclude_seq" value="false"/>
+            <param name="exclude_gff3" value="true"/>
+            <param name="uncompressed" value="true"/>
+            <param name="released_before" value="01/02/2007"></param>
+            <output name="genome_data_report">
+                <assert_contents>
+                   <has_text text="SAMN02604181"/>
+                </assert_contents>
+            </output>
+            <output_collection name="sequence_report" type="list">
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/>
             </output_collection>
             <output_collection name="genomic_gff" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/>
-            </output_collection>
-            <output_collection name="genomic_gtf" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gtf" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
+                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
             </output_collection>
             <output_collection name="genomic_gbff" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gbff" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gbff" compare="contains"/>
+                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
             </output_collection>
         </test>
     </tests>
     <help>
+<![CDATA[
+**Download Genome Datasets from NCBI**
 
 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.
 
-The default genome dataset includes the following files (if available):
-* genomic.fna (genomic sequences)
-* rna.fna (transcript sequences)
-* protein.faa (protein sequences)
-* genomic.gff (genome annotation in gff3 format)
-* data_report.jsonl (data report with genome assembly and annotation metadata)
-* dataset_catalog.json (a list of files and file types included in the dataset)
+The default genome dataset includes the following files (if available):
+ * genomic.fna (genomic sequences)
+ * rna.fna (transcript sequences)
+ * protein.faa (protein sequences)
+ * genomic.gff (genome annotation in gff3 format)
+ * assembly_data_report.jsonl (data report with genome assembly and annotation metadata)
+ * dataset_catalog.json (a list of files and file types included in the dataset)
+]]>
     </help>
 
 </tool>
author	iuc
date	Thu, 27 Jan 2022 08:20:15 +0000
parents	2753a5786114
children	d64df2210624