Mercurial > repos > iuc > ncbi_datasets
comparison datasets_genome.xml @ 3:c87df3f9e19d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 800d16f3bd40266d8734f4572988cb2b306b4fd3"
| author | iuc |
|---|---|
| date | Thu, 27 Jan 2022 08:20:15 +0000 |
| parents | 2753a5786114 |
| children | d64df2210624 |
comparison
equal
deleted
inserted
replaced
| 2:2753a5786114 | 3:c87df3f9e19d |
|---|---|
| 1 <tool id="datasets_download_genome" name="NCBI datasets download genome" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> | 1 <tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> |
| 2 <description>Download assembled genomes from NCBI</description> | 2 <description>download genome sequence, annotation and metadata</description> |
| 3 <macros> | 3 <macros> |
| 4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
| 5 </macros> | 5 </macros> |
| 6 <expand macro="requirements"></expand> | 6 <expand macro="requirements"></expand> |
| 7 <command><![CDATA[ | 7 <command><![CDATA[ |
| 8 @SETUP_CERTIFICATES@ | 8 @SETUP_CERTIFICATES@ |
| 9 datasets download genome $subcommand.download_by | 9 datasets download genome $query.subcommand.download_by |
| 10 #if $subcommand.download_by == 'accession': | 10 #if $query.subcommand.download_by == 'accession': |
| 11 #if $subcommand.text_or_file.text_or_file == 'text': | 11 #if $query.subcommand.text_or_file.text_or_file == 'text': |
| 12 #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x) | 12 #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x) |
| 13 #else | 13 #else |
| 14 --inputfile '$subcommand.text_or_file.inputfile' | 14 --inputfile '$query.subcommand.text_or_file.inputfile' |
| 15 #end if | 15 #end if |
| 16 #else: | 16 #else: |
| 17 '$subcommand.taxon' | 17 '$query.subcommand.taxon' |
| 18 #end if | 18 #end if |
| 19 $annotated | 19 $filters.reference |
| 20 $dehydrated | 20 $filters.annotated |
| 21 #if $assembly_level: | 21 #if $filters.assembly_level: |
| 22 --assembly_level $assembly_level | 22 --assembly_level $filters.assembly_level |
| 23 #end if | 23 #end if |
| 24 #if $assembly_source: | 24 #if $filters.assembly_source: |
| 25 --assembly_source $assembly_source | 25 --assembly_source $filters.assembly_source |
| 26 #end if | 26 #end if |
| 27 --chromosomes '$chromosomes' | 27 #if $filters.chromosomes: |
| 28 --chromosomes '$filters.chromosomes' | |
| 29 #end if | |
| 28 @EXCLUDES_GENOME@ | 30 @EXCLUDES_GENOME@ |
| 29 @INCLUDES_GENOME@ | 31 @INCLUDES_GENOME@ |
| 30 $reference | |
| 31 @RELEASED_BEFORE@ | 32 @RELEASED_BEFORE@ |
| 32 @RELEASED_SINCE@ | 33 @RELEASED_SINCE@ |
| 33 #for search_term in $search: | 34 #for search_term in $filters.search: |
| 34 --search '$search_term' | 35 --search '$filters.search_term' |
| 35 #end for | 36 #end for |
| 36 #if not $dehydrated: | 37 #if $uncompressed |
| 37 && 7z x ncbi_dataset.zip | 38 && unzip ncbi_dataset.zip |
| 39 #else | |
| 40 && unzip -l ncbi_dataset.zip > ncbi_dataset.txt | |
| 38 #end if | 41 #end if |
| 39 ]]></command> | 42 ]]></command> |
| 40 <inputs> | 43 <inputs> |
| 41 <conditional name="subcommand"> | 44 <section name="query" title="Query" expanded="true"> |
| 42 <param name="download_by" type="select" label="Choose how to find genomes to download"> | 45 <conditional name="subcommand"> |
| 43 <option value="accession">Download by NCBI assembly or BioProject accession</option> | 46 <param name="download_by" type="select" label="Choose how to find genomes to download"> |
| 44 <option value="taxon">Download by taxon</option> | 47 <option value="accession">Download by NCBI assembly or BioProject accession</option> |
| 45 </param> | 48 <option value="taxon">Download by taxon</option> |
| 46 <when value="accession"> | 49 </param> |
| 47 <expand macro="text_or_file"/> | 50 <when value="accession"> |
| 48 </when> | 51 <expand macro="text_or_file"/> |
| 49 <when value="taxon"> | 52 </when> |
| 50 <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurs, etc."></param> | 53 <when value="taxon"> |
| 51 </when> | 54 <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."></param> |
| 52 </conditional> | 55 </when> |
| 53 <expand macro="annotation"></expand> | 56 </conditional> |
| 54 <expand macro="dehydrated"></expand> | 57 </section> |
| 55 <expand macro="assembly_level"></expand> | 58 <section name="filters" title="Filters and Limit"> |
| 56 <expand macro="assembly_source"></expand> | 59 <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/> |
| 57 <expand macro="chromosomes"></expand> | 60 <expand macro="annotation"></expand> |
| 58 <expand macro="excludes_genome"></expand> | 61 <expand macro="assembly_level"></expand> |
| 59 <expand macro="includes_genome"></expand> | 62 <expand macro="assembly_source"></expand> |
| 60 <expand macro="released_options"></expand> | 63 <expand macro="chromosomes"></expand> |
| 61 <expand macro="released_options" before_or_after="since"></expand> | 64 <expand macro="released_options"></expand> |
| 62 <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/> | 65 <expand macro="released_options" before_or_after="since"></expand> |
| 63 <repeat name="search" title="Add search terms"> | 66 |
| 64 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> | 67 <repeat name="search" title="Add search terms"> |
| 65 </repeat> | 68 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> |
| 69 </repeat> | |
| 70 </section> | |
| 71 <section name="file_choices" title="File Choices"> | |
| 72 <expand macro="excludes_genome"></expand> | |
| 73 <expand macro="includes_genome"></expand> | |
| 74 </section> | |
| 75 <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/> | |
| 66 </inputs> | 76 </inputs> |
| 67 <outputs> | 77 <outputs> |
| 68 <data name="dehydrated_archive" format="zip" label="Dehydrated Archive" from_work_dir="ncbi_dataset.zip"> | 78 <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip"> |
| 69 <filter>dehydrated</filter> | 79 <filter>not uncompressed</filter> |
| 70 </data> | 80 </data> |
| 71 <collection name="genome_fasta" label="NCBI genome datasets: genome fasta" type="list"> | 81 <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt"> |
| 72 <discover_datasets pattern="(?P<identifier_0>.*?)\/.*_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | 82 <filter>not uncompressed</filter> |
| 73 <filter>not dehydrated and not exclude_seq</filter> | 83 </data> |
| 74 </collection> | 84 <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl"> |
| 75 <collection name="protein_fasta" label="NCBI genome datasets: protein fasta" type="list"> | 85 <filter>uncompressed</filter> |
| 86 </data> | |
| 87 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> | |
| 88 <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | |
| 89 <filter>uncompressed</filter> | |
| 90 </collection> | |
| 91 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list"> | |
| 92 <discover_datasets pattern="(?P<identifier_0>.*?)\/.*(?<!cds_from)(chr|unplaced|_genomic)*fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | |
| 93 <filter>uncompressed and file_choices['exclude_seq']</filter> | |
| 94 </collection> | |
| 95 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> | |
| 96 <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | |
| 97 <filter>uncompressed and file_choices['exclude_genomic_cds']</filter> | |
| 98 </collection> | |
| 99 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> | |
| 100 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | |
| 101 <filter>uncompressed and file_choices['exclude_gff3']</filter> | |
| 102 </collection> | |
| 103 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> | |
| 104 <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | |
| 105 <filter>uncompressed and file_choices['exclude_rna']</filter> | |
| 106 </collection> | |
| 107 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> | |
| 76 <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | 108 <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> |
| 77 <filter>not dehydrated and not exclude_protein</filter> | 109 <filter>uncompressed and file_choices['exclude_protein']</filter> |
| 78 </collection> | 110 </collection> |
| 79 <collection name="genomic_gff" label="NCBI genome datasets: genomic gff" type="list"> | 111 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> |
| 80 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | 112 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> |
| 81 <filter>not dehydrated and not exclude_gff3</filter> | 113 <filter>uncompressed and file_choices['include_gbff']</filter> |
| 82 </collection> | 114 </collection> |
| 83 <collection name="genomic_gtf" label="NCBI genome datasets: genomic gtf" type="list"> | 115 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> |
| 84 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | 116 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> |
| 85 <filter>not dehydrated and include_gtf</filter> | 117 <filter>uncompressed and file_choices['include_gtf']</filter> |
| 86 </collection> | |
| 87 <collection name="genomic_gbff" label="NCBI genome datasets: genomic gbff" type="list"> | |
| 88 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="genbank" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> | |
| 89 <filter>not dehydrated and include_gbff</filter> | |
| 90 </collection> | 118 </collection> |
| 91 </outputs> | 119 </outputs> |
| 92 <tests> | 120 <tests> |
| 93 <test title="test dehydrated download by taxon"> | 121 <test expect_num_outputs="2"> |
| 94 <conditional name="subcommand"> | 122 <conditional name="query|subcommand"> |
| 95 <param name="download_by" value="taxon"></param> | 123 <param name="download_by" value="taxon"></param> |
| 96 <param name="text_or_file" value="text"></param> | 124 <param name="text_or_file" value="text"></param> |
| 97 <param name="taxon" value="human"></param> | 125 <param name="taxon" value="human"></param> |
| 98 </conditional> | 126 </conditional> |
| 99 <param name="chromosomes" value="21"></param> | 127 <param name="chromosomes" value="21"></param> |
| 100 <param name="dehydrated" value="true"/> | 128 <param name="uncompressed" value="false"/> |
| 101 <param name="released_before" value="01/01/2018"></param> | 129 <param name="released_before" value="01/01/2018"></param> |
| 102 <output name="dehydrated_archive" value="human_chrom_21_dehydrated.zip" compare="sim_size" delta="10000"/> | 130 <output name="archive_contents"> |
| 131 <assert_contents> | |
| 132 <has_text text="ncbi_dataset/data/dataset_catalog.json"/> | |
| 133 </assert_contents> | |
| 134 </output> | |
| 103 </test> | 135 </test> |
| 104 <test title="test download by comma-separated accession"> | 136 <test expect_num_outputs="5"> |
| 105 <conditional name="subcommand"> | 137 <conditional name="query|subcommand"> |
| 106 <param name="download_by" value="accession"></param> | 138 <param name="download_by" value="accession"></param> |
| 107 <conditional name="text_or_file"> | 139 <conditional name="text_or_file"> |
| 108 <param name="text_or_file" value="text"></param> | 140 <param name="text_or_file" value="text"></param> |
| 109 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"></param> | 141 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"></param> |
| 110 </conditional> | 142 </conditional> |
| 111 </conditional> | 143 </conditional> |
| 112 <param name="dehydrated" value="false"/> | 144 <param name="uncompressed" value="true"/> |
| 113 <param name="released_before" value="01/01/2007"></param> | 145 <param name="released_before" value="01/01/2007"></param> |
| 114 <output_collection name="genome_fasta" type="list"> | 146 <param name="exclude_genomic_cds" value="true"/> |
| 115 <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/> | 147 <param name="include_gtf" value="true"/> |
| 116 <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/> | 148 <output name="genome_data_report"> |
| 117 </output_collection> | 149 <assert_contents> |
| 118 <output_collection name="protein_fasta" type="list"> | 150 <has_text text="GCF_000013305.1"/> |
| 119 <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/> | 151 </assert_contents> |
| 120 <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/> | 152 </output> |
| 121 </output_collection> | 153 <output_collection name="sequence_report" type="list"> |
| 122 <output_collection name="genomic_gff" type="list"> | 154 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/> |
| 123 <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/> | 155 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/> |
| 124 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/> | 156 </output_collection> |
| 157 <output_collection name="genomic_gtf" type="list"> | |
| 158 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> | |
| 159 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> | |
| 160 </output_collection> | |
| 161 <output_collection name="genomic_cds" type="list"> | |
| 162 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/> | |
| 163 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> | |
| 125 </output_collection> | 164 </output_collection> |
| 126 </test> | 165 </test> |
| 127 <test title="test download by accessions listed in file"> | 166 <test expect_num_outputs="4"> |
| 128 <conditional name="subcommand"> | 167 <conditional name="query|subcommand"> |
| 129 <param name="download_by" value="accession"></param> | 168 <param name="download_by" value="accession"></param> |
| 130 <conditional name="text_or_file"> | 169 <conditional name="text_or_file"> |
| 131 <param name="text_or_file" value="file"></param> | 170 <param name="text_or_file" value="file"></param> |
| 132 <param name="inputfile" value="accessions.txt"></param> | 171 <param name="inputfile" value="accessions.txt"></param> |
| 133 </conditional> | 172 </conditional> |
| 134 </conditional> | 173 </conditional> |
| 135 <param name="include_gbff" value="true"/> | 174 <param name="include_gbff" value="true"/> |
| 136 <param name="include_gtf" value="true"/> | 175 <param name="exclude_seq" value="false"/> |
| 137 <param name="dehydrated" value="false"/> | 176 <param name="exclude_gff3" value="true"/> |
| 138 <param name="released_before" value="01/01/2007"></param> | 177 <param name="uncompressed" value="true"/> |
| 139 <output_collection name="genome_fasta" type="list"> | 178 <param name="released_before" value="01/02/2007"></param> |
| 140 <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/> | 179 <output name="genome_data_report"> |
| 141 <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/> | 180 <assert_contents> |
| 142 </output_collection> | 181 <has_text text="SAMN02604181"/> |
| 143 <output_collection name="protein_fasta" type="list"> | 182 </assert_contents> |
| 144 <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/> | 183 </output> |
| 145 <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/> | 184 <output_collection name="sequence_report" type="list"> |
| 185 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/> | |
| 186 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/> | |
| 146 </output_collection> | 187 </output_collection> |
| 147 <output_collection name="genomic_gff" type="list"> | 188 <output_collection name="genomic_gff" type="list"> |
| 148 <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/> | 189 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> |
| 149 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/> | 190 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> |
| 150 </output_collection> | |
| 151 <output_collection name="genomic_gtf" type="list"> | |
| 152 <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gtf" compare="contains"/> | |
| 153 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> | |
| 154 </output_collection> | 191 </output_collection> |
| 155 <output_collection name="genomic_gbff" type="list"> | 192 <output_collection name="genomic_gbff" type="list"> |
| 156 <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gbff" compare="contains"/> | 193 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> |
| 157 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gbff" compare="contains"/> | 194 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> |
| 158 </output_collection> | 195 </output_collection> |
| 159 </test> | 196 </test> |
| 160 </tests> | 197 </tests> |
| 161 <help> | 198 <help> |
| 199 <![CDATA[ | |
| 200 **Download Genome Datasets from NCBI** | |
| 162 | 201 |
| 163 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. | 202 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. |
| 164 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. | 203 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. |
| 165 | 204 |
| 166 The default genome dataset includes the following files (if available): | 205 The default genome dataset includes the following files (if available): |
| 167 * genomic.fna (genomic sequences) | 206 * genomic.fna (genomic sequences) |
| 168 * rna.fna (transcript sequences) | 207 * rna.fna (transcript sequences) |
| 169 * protein.faa (protein sequences) | 208 * protein.faa (protein sequences) |
| 170 * genomic.gff (genome annotation in gff3 format) | 209 * genomic.gff (genome annotation in gff3 format) |
| 171 * data_report.jsonl (data report with genome assembly and annotation metadata) | 210 * data_report.jsonl (data report with genome assembly and annotation metadata) |
| 172 * dataset_catalog.json (a list of files and file types included in the dataset) | 211 * dataset_catalog.json (a list of files and file types included in the dataset) |
| 212 ]]> | |
| 173 </help> | 213 </help> |
| 174 | 214 |
| 175 </tool> | 215 </tool> |
