Mercurial > repos > iuc > ncbi_datasets

diff datasets_genome.xml @ 0:c6009f4d7261 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit fd91cf3000d556d8219426eddb8a3059071a2009"
author: iuc
date: Thu, 15 Jul 2021 13:31:56 +0000
children: 2753a5786114
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets_genome.xml	Thu Jul 15 13:31:56 2021 +0000
@@ -0,0 +1,175 @@
+<tool id="datasets-download-genome" name="NCBI datasets download genome" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@">
+    <description>Download assembled genomes from NCBI</description>
+    <macros>
+        <import>macros.xml</import> <!-- presumably defines the @TOKEN@ values and the expand macros used in this tool; confirm against macros.xml -->
+    </macros>
+    <expand macro="requirements"></expand>
+    <command><![CDATA[
+@SETUP_CERTIFICATES@
+datasets download genome $subcommand.download_by
+#if $subcommand.download_by == 'accession':
+    #if $subcommand.text_or_file.text_or_file == 'text':
+        '$subcommand.text_or_file.accession'
+    #else
+        --inputfile '$subcommand.text_or_file.inputfile'
+    #end if
+#else:
+    '$subcommand.taxon'
+#end if
+$annotated
+$dehydrated
+#if $assembly_level:
+--assembly_level $assembly_level
+#end if
+#if $assembly_source:
+--assembly_source $assembly_source
+#end if
+--chromosomes '$chromosomes'
+@EXCLUDES_GENOME@
+@INCLUDES_GENOME@
+$reference
+@RELEASED_BEFORE@
+@RELEASED_SINCE@
+#for $search_term in $search:
+    --search '$search_term.search'
+#end for
+#if not $dehydrated:
+    && 7z x ncbi_dataset.zip
+#end if
+]]></command>
+    <inputs>
+        <conditional name="subcommand"> <!-- the selected download_by value is passed verbatim as the datasets subcommand -->
+            <param name="download_by" type="select" label="Choose how to find genomes to download">
+                <option value="accession">Download by NCBI assembly or BioProject accession</option>
+                <option value="taxon">Download by taxon</option>
+            </param>
+            <when value="accession">
+                <expand macro="text_or_file"/>
+            </when>
+            <when value="taxon">
+                <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, Bos taurus, etc."></param>
+            </when>
+        </conditional>
+        <expand macro="annotation"></expand>
+        <expand macro="dehydrated"></expand>
+        <expand macro="assembly_level"></expand>
+        <expand macro="assembly_source"></expand>
+        <expand macro="chromosomes"></expand>
+        <expand macro="excludes_genome"></expand>
+        <expand macro="includes_genome"></expand>
+        <expand macro="released_options"></expand>
+        <expand macro="released_options" before_or_after="since"></expand>
+        <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
+        <repeat name="search" title="Add search terms"> <!-- each repeat instance adds one search term -->
+            <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
+        </repeat>
+    </inputs>
+    <outputs> <!-- zip archive for dehydrated runs; otherwise one list collection per file type discovered under ncbi_dataset/data -->
+        <data name="dehydrated_archive" label="Dehydrated Archive" format="zip" from_work_dir="ncbi_dataset.zip">
+            <filter>dehydrated</filter>
+        </data>
+        <collection name="genome_fasta" type="list" label="NCBI genome datasets: genome fasta">
+            <discover_datasets directory="ncbi_dataset/data" recurse="true" match_relative_path="true" pattern="(?P&lt;identifier_0&gt;.*?)\/.*_genomic\.fna" ext="fasta"/>
+            <filter>not dehydrated and not exclude_seq</filter>
+        </collection>
+        <collection name="protein_fasta" type="list" label="NCBI genome datasets: protein fasta">
+            <discover_datasets directory="ncbi_dataset/data" recurse="true" match_relative_path="true" pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta"/>
+            <filter>not dehydrated and not exclude_protein</filter>
+        </collection>
+        <collection name="genomic_gff" type="list" label="NCBI genome datasets: genomic gff">
+            <discover_datasets directory="ncbi_dataset/data" recurse="true" match_relative_path="true" pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3"/>
+            <filter>not dehydrated and not exclude_gff3</filter>
+        </collection>
+        <collection name="genomic_gtf" type="list" label="NCBI genome datasets: genomic gtf">
+            <discover_datasets directory="ncbi_dataset/data" recurse="true" match_relative_path="true" pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf"/>
+            <filter>not dehydrated and include_gtf</filter>
+        </collection>
+        <collection name="genomic_gbff" type="list" label="NCBI genome datasets: genomic gbff">
+            <discover_datasets directory="ncbi_dataset/data" recurse="true" match_relative_path="true" pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="genbank"/>
+            <filter>not dehydrated and include_gbff</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <test title="test dehydrated download by taxon">
+            <conditional name="subcommand"> <!-- the taxon branch only defines the taxon parameter -->
+                <param name="download_by" value="taxon"></param>
+                <param name="taxon" value="human"></param>
+            </conditional>
+            <param name="chromosomes" value="21"></param>
+            <param name="dehydrated" value="true"/>
+            <param name="released_before" value="01/01/2018"></param>
+            <output name="dehydrated_archive" value="human_chrom_21_dehydrated.zip"
+                    compare="sim_size" delta="10000"/>
+        </test>
+        <test title="test download by comma-separated accession"> <!-- full (non-dehydrated) download; checks the three default collections -->
+            <conditional name="subcommand">
+                <param name="download_by" value="accession"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="GCF_000013305.1,GCF_000007445.1"/>
+                </conditional>
+            </conditional>
+            <param name="dehydrated" value="false"/>
+            <param name="released_before" value="01/01/2007"/>
+            <output_collection type="list" name="genome_fasta">
+                <element name="GCF_000013305.1" compare="contains" file="GCF_000013305.1.genome.fa"/>
+                <element name="GCF_000007445.1" compare="contains" file="GCF_000007445.1.genome.fa"/>
+            </output_collection>
+            <output_collection type="list" name="protein_fasta">
+                <element name="GCF_000013305.1" compare="contains" file="GCF_000013305.1.protein.fa"/>
+                <element name="GCF_000007445.1" compare="contains" file="GCF_000007445.1.protein.fa"/>
+            </output_collection>
+            <output_collection type="list" name="genomic_gff">
+                <element name="GCF_000013305.1" compare="contains" file="GCF_000013305.1.genomic.gff"/>
+                <element name="GCF_000007445.1" compare="contains" file="GCF_000007445.1.genomic.gff"/>
+            </output_collection>
+        </test>
+        <test title="test download by accessions listed in file"> <!-- also enables gbff and gtf, so all five collections are checked -->
+            <conditional name="subcommand">
+                <param name="download_by" value="accession"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="file"/>
+                    <param name="inputfile" value="accessions.txt"/>
+                </conditional>
+            </conditional>
+            <param name="include_gbff" value="true"/>
+            <param name="include_gtf" value="true"/>
+            <param name="dehydrated" value="false"/>
+            <param name="released_before" value="01/01/2007"/>
+            <output_collection type="list" name="genome_fasta">
+                <element name="GCF_000013305.1" compare="contains" file="GCF_000013305.1.genome.fa"/>
+                <element name="GCF_000007445.1" compare="contains" file="GCF_000007445.1.genome.fa"/>
+            </output_collection>
+            <output_collection type="list" name="protein_fasta">
+                <element name="GCF_000013305.1" compare="contains" file="GCF_000013305.1.protein.fa"/>
+                <element name="GCF_000007445.1" compare="contains" file="GCF_000007445.1.protein.fa"/>
+            </output_collection>
+            <output_collection type="list" name="genomic_gff">
+                <element name="GCF_000013305.1" compare="contains" file="GCF_000013305.1.genomic.gff"/>
+                <element name="GCF_000007445.1" compare="contains" file="GCF_000007445.1.genomic.gff"/>
+            </output_collection>
+            <output_collection type="list" name="genomic_gtf">
+                <element name="GCF_000013305.1" compare="contains" file="GCF_000013305.1.genomic.gtf"/>
+                <element name="GCF_000007445.1" compare="contains" file="GCF_000007445.1.genomic.gtf"/>
+            </output_collection>
+            <output_collection type="list" name="genomic_gbff">
+                <element name="GCF_000013305.1" compare="contains" file="GCF_000013305.1.genomic.gbff"/>
+                <element name="GCF_000007445.1" compare="contains" file="GCF_000007445.1.genomic.gbff"/>
+            </output_collection>
+        </test>
+    </tests>
+    <help>
+Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
+Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.
+
+The default genome dataset includes the following files (if available):
+
+* genomic.fna (genomic sequences)
+* rna.fna (transcript sequences)
+* protein.faa (protein sequences)
+* genomic.gff (genome annotation in gff3 format)
+* data_report.jsonl (data report with genome assembly and annotation metadata)
+* dataset_catalog.json (a list of files and file types included in the dataset)
+    </help>
+
+</tool>
author	iuc
date	Thu, 15 Jul 2021 13:31:56 +0000
parents
children	2753a5786114