comparison datasets_genome.xml @ 3:c87df3f9e19d draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 800d16f3bd40266d8734f4572988cb2b306b4fd3"
author iuc
date Thu, 27 Jan 2022 08:20:15 +0000
parents 2753a5786114
children d64df2210624
comparison
equal deleted inserted replaced
2:2753a5786114 3:c87df3f9e19d
1 <tool id="datasets_download_genome" name="NCBI datasets download genome" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> 1 <tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
2 <description>Download assembled genomes from NCBI</description> 2 <description>download genome sequence, annotation and metadata</description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements"></expand> 6 <expand macro="requirements"></expand>
7 <command><![CDATA[ 7 <command><![CDATA[
8 @SETUP_CERTIFICATES@ 8 @SETUP_CERTIFICATES@
9 datasets download genome $subcommand.download_by 9 datasets download genome $query.subcommand.download_by
10 #if $subcommand.download_by == 'accession': 10 #if $query.subcommand.download_by == 'accession':
11 #if $subcommand.text_or_file.text_or_file == 'text': 11 #if $query.subcommand.text_or_file.text_or_file == 'text':
12 #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x) 12 #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x)
13 #else 13 #else
14 --inputfile '$subcommand.text_or_file.inputfile' 14 --inputfile '$query.subcommand.text_or_file.inputfile'
15 #end if 15 #end if
16 #else: 16 #else:
17 '$subcommand.taxon' 17 '$query.subcommand.taxon'
18 #end if 18 #end if
19 $annotated 19 $filters.reference
20 $dehydrated 20 $filters.annotated
21 #if $assembly_level: 21 #if $filters.assembly_level:
22 --assembly_level $assembly_level 22 --assembly_level $filters.assembly_level
23 #end if 23 #end if
24 #if $assembly_source: 24 #if $filters.assembly_source:
25 --assembly_source $assembly_source 25 --assembly_source $filters.assembly_source
26 #end if 26 #end if
27 --chromosomes '$chromosomes' 27 #if $filters.chromosomes:
28 --chromosomes '$filters.chromosomes'
29 #end if
28 @EXCLUDES_GENOME@ 30 @EXCLUDES_GENOME@
29 @INCLUDES_GENOME@ 31 @INCLUDES_GENOME@
30 $reference
31 @RELEASED_BEFORE@ 32 @RELEASED_BEFORE@
32 @RELEASED_SINCE@ 33 @RELEASED_SINCE@
33 #for search_term in $search: 34 #for search_term in $filters.search:
34 --search '$search_term' 35 --search '$filters.search_term'
35 #end for 36 #end for
36 #if not $dehydrated: 37 #if $uncompressed
37 && 7z x ncbi_dataset.zip 38 && unzip ncbi_dataset.zip
39 #else
40 && unzip -l ncbi_dataset.zip > ncbi_dataset.txt
38 #end if 41 #end if
39 ]]></command> 42 ]]></command>
40 <inputs> 43 <inputs>
41 <conditional name="subcommand"> 44 <section name="query" title="Query" expanded="true">
42 <param name="download_by" type="select" label="Choose how to find genomes to download"> 45 <conditional name="subcommand">
43 <option value="accession">Download by NCBI assembly or BioProject accession</option> 46 <param name="download_by" type="select" label="Choose how to find genomes to download">
44 <option value="taxon">Download by taxon</option> 47 <option value="accession">Download by NCBI assembly or BioProject accession</option>
45 </param> 48 <option value="taxon">Download by taxon</option>
46 <when value="accession"> 49 </param>
47 <expand macro="text_or_file"/> 50 <when value="accession">
48 </when> 51 <expand macro="text_or_file"/>
49 <when value="taxon"> 52 </when>
50 <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurs, etc."></param> 53 <when value="taxon">
51 </when> 54 <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."></param>
52 </conditional> 55 </when>
53 <expand macro="annotation"></expand> 56 </conditional>
54 <expand macro="dehydrated"></expand> 57 </section>
55 <expand macro="assembly_level"></expand> 58 <section name="filters" title="Filters and Limit">
56 <expand macro="assembly_source"></expand> 59 <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
57 <expand macro="chromosomes"></expand> 60 <expand macro="annotation"></expand>
58 <expand macro="excludes_genome"></expand> 61 <expand macro="assembly_level"></expand>
59 <expand macro="includes_genome"></expand> 62 <expand macro="assembly_source"></expand>
60 <expand macro="released_options"></expand> 63 <expand macro="chromosomes"></expand>
61 <expand macro="released_options" before_or_after="since"></expand> 64 <expand macro="released_options"></expand>
62 <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/> 65 <expand macro="released_options" before_or_after="since"></expand>
63 <repeat name="search" title="Add search terms"> 66
64 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> 67 <repeat name="search" title="Add search terms">
65 </repeat> 68 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
69 </repeat>
70 </section>
71 <section name="file_choices" title="File Choices">
72 <expand macro="excludes_genome"></expand>
73 <expand macro="includes_genome"></expand>
74 </section>
75 <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/>
66 </inputs> 76 </inputs>
67 <outputs> 77 <outputs>
68 <data name="dehydrated_archive" format="zip" label="Dehydrated Archive" from_work_dir="ncbi_dataset.zip"> 78 <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip">
69 <filter>dehydrated</filter> 79 <filter>not uncompressed</filter>
70 </data> 80 </data>
71 <collection name="genome_fasta" label="NCBI genome datasets: genome fasta" type="list"> 81 <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt">
72 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/.*_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> 82 <filter>not uncompressed</filter>
73 <filter>not dehydrated and not exclude_seq</filter> 83 </data>
74 </collection> 84 <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl">
75 <collection name="protein_fasta" label="NCBI genome datasets: protein fasta" type="list"> 85 <filter>uncompressed</filter>
86 </data>
87 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
88 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
89 <filter>uncompressed</filter>
90 </collection>
91 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list">
92 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/.*(?&lt;!cds_from)(chr|unplaced|_genomic)*fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
93 <filter>uncompressed and file_choices['exclude_seq']</filter>
94 </collection>
95 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
96 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
97 <filter>uncompressed and file_choices['exclude_genomic_cds']</filter>
98 </collection>
99 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
100 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
101 <filter>uncompressed and file_choices['exclude_gff3']</filter>
102 </collection>
103 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
104 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
105 <filter>uncompressed and file_choices['exclude_rna']</filter>
106 </collection>
107 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
76 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> 108 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
77 <filter>not dehydrated and not exclude_protein</filter> 109 <filter>uncompressed and file_choices['exclude_protein']</filter>
78 </collection> 110 </collection>
79 <collection name="genomic_gff" label="NCBI genome datasets: genomic gff" type="list"> 111 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
80 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> 112 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
81 <filter>not dehydrated and not exclude_gff3</filter> 113 <filter>uncompressed and file_choices['include_gbff']</filter>
82 </collection> 114 </collection>
83 <collection name="genomic_gtf" label="NCBI genome datasets: genomic gtf" type="list"> 115 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
84 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> 116 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
85 <filter>not dehydrated and include_gtf</filter> 117 <filter>uncompressed and file_choices['include_gtf']</filter>
86 </collection>
87 <collection name="genomic_gbff" label="NCBI genome datasets: genomic gbff" type="list">
88 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="genbank" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
89 <filter>not dehydrated and include_gbff</filter>
90 </collection> 118 </collection>
91 </outputs> 119 </outputs>
92 <tests> 120 <tests>
93 <test title="test dehydrated download by taxon"> 121 <test expect_num_outputs="2">
94 <conditional name="subcommand"> 122 <conditional name="query|subcommand">
95 <param name="download_by" value="taxon"></param> 123 <param name="download_by" value="taxon"></param>
96 <param name="text_or_file" value="text"></param> 124 <param name="text_or_file" value="text"></param>
97 <param name="taxon" value="human"></param> 125 <param name="taxon" value="human"></param>
98 </conditional> 126 </conditional>
99 <param name="chromosomes" value="21"></param> 127 <param name="chromosomes" value="21"></param>
100 <param name="dehydrated" value="true"/> 128 <param name="uncompressed" value="false"/>
101 <param name="released_before" value="01/01/2018"></param> 129 <param name="released_before" value="01/01/2018"></param>
102 <output name="dehydrated_archive" value="human_chrom_21_dehydrated.zip" compare="sim_size" delta="10000"/> 130 <output name="archive_contents">
131 <assert_contents>
132 <has_text text="ncbi_dataset/data/dataset_catalog.json"/>
133 </assert_contents>
134 </output>
103 </test> 135 </test>
104 <test title="test download by comma-separated accession"> 136 <test expect_num_outputs="5">
105 <conditional name="subcommand"> 137 <conditional name="query|subcommand">
106 <param name="download_by" value="accession"></param> 138 <param name="download_by" value="accession"></param>
107 <conditional name="text_or_file"> 139 <conditional name="text_or_file">
108 <param name="text_or_file" value="text"></param> 140 <param name="text_or_file" value="text"></param>
109 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"></param> 141 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"></param>
110 </conditional> 142 </conditional>
111 </conditional> 143 </conditional>
112 <param name="dehydrated" value="false"/> 144 <param name="uncompressed" value="true"/>
113 <param name="released_before" value="01/01/2007"></param> 145 <param name="released_before" value="01/01/2007"></param>
114 <output_collection name="genome_fasta" type="list"> 146 <param name="exclude_genomic_cds" value="true"/>
115 <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/> 147 <param name="include_gtf" value="true"/>
116 <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/> 148 <output name="genome_data_report">
117 </output_collection> 149 <assert_contents>
118 <output_collection name="protein_fasta" type="list"> 150 <has_text text="GCF_000013305.1"/>
119 <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/> 151 </assert_contents>
120 <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/> 152 </output>
121 </output_collection> 153 <output_collection name="sequence_report" type="list">
122 <output_collection name="genomic_gff" type="list"> 154 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/>
123 <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/> 155 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/>
124 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/> 156 </output_collection>
157 <output_collection name="genomic_gtf" type="list">
158 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
159 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
160 </output_collection>
161 <output_collection name="genomic_cds" type="list">
162 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
163 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
125 </output_collection> 164 </output_collection>
126 </test> 165 </test>
127 <test title="test download by accessions listed in file"> 166 <test expect_num_outputs="4">
128 <conditional name="subcommand"> 167 <conditional name="query|subcommand">
129 <param name="download_by" value="accession"></param> 168 <param name="download_by" value="accession"></param>
130 <conditional name="text_or_file"> 169 <conditional name="text_or_file">
131 <param name="text_or_file" value="file"></param> 170 <param name="text_or_file" value="file"></param>
132 <param name="inputfile" value="accessions.txt"></param> 171 <param name="inputfile" value="accessions.txt"></param>
133 </conditional> 172 </conditional>
134 </conditional> 173 </conditional>
135 <param name="include_gbff" value="true"/> 174 <param name="include_gbff" value="true"/>
136 <param name="include_gtf" value="true"/> 175 <param name="exclude_seq" value="false"/>
137 <param name="dehydrated" value="false"/> 176 <param name="exclude_gff3" value="true"/>
138 <param name="released_before" value="01/01/2007"></param> 177 <param name="uncompressed" value="true"/>
139 <output_collection name="genome_fasta" type="list"> 178 <param name="released_before" value="01/02/2007"></param>
140 <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/> 179 <output name="genome_data_report">
141 <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/> 180 <assert_contents>
142 </output_collection> 181 <has_text text="SAMN02604181"/>
143 <output_collection name="protein_fasta" type="list"> 182 </assert_contents>
144 <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/> 183 </output>
145 <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/> 184 <output_collection name="sequence_report" type="list">
185 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/>
186 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/>
146 </output_collection> 187 </output_collection>
147 <output_collection name="genomic_gff" type="list"> 188 <output_collection name="genomic_gff" type="list">
148 <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/> 189 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
149 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/> 190 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
150 </output_collection>
151 <output_collection name="genomic_gtf" type="list">
152 <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gtf" compare="contains"/>
153 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
154 </output_collection> 191 </output_collection>
155 <output_collection name="genomic_gbff" type="list"> 192 <output_collection name="genomic_gbff" type="list">
156 <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gbff" compare="contains"/> 193 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
157 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gbff" compare="contains"/> 194 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
158 </output_collection> 195 </output_collection>
159 </test> 196 </test>
160 </tests> 197 </tests>
161 <help> 198 <help>
199 <![CDATA[
200 **Download Genome Datasets from NCBI**
162 201
163 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. 202 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
164 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. 203 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.
165 204
166 The default genome dataset includes the following files (if available): 205 The default genome dataset includes the following files (if available):
167 * genomic.fna (genomic sequences) 206 * genomic.fna (genomic sequences)
168 * rna.fna (transcript sequences) 207 * rna.fna (transcript sequences)
169 * protein.faa (protein sequences) 208 * protein.faa (protein sequences)
170 * genomic.gff (genome annotation in gff3 format) 209 * genomic.gff (genome annotation in gff3 format)
171 * data_report.jsonl (data report with genome assembly and annotation metadata) 210 * data_report.jsonl (data report with genome assembly and annotation metadata)
172 * dataset_catalog.json (a list of files and file types included in the dataset) 211 * dataset_catalog.json (a list of files and file types included in the dataset)
212 ]]>
173 </help> 213 </help>
174 214
175 </tool> 215 </tool>