Mercurial > repos > iuc > ncbi_datasets
comparison datasets_genome.xml @ 14:a222b4d3d52e draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit d3fa7b70aa028f527a1dbbb210c172c637dfd4d9
| author | iuc |
|---|---|
| date | Fri, 09 Dec 2022 15:11:04 +0000 |
| parents | d979ba07ddd4 |
| children | dfad868c911b |
comparison
equal
deleted
inserted
replaced
| 13:d979ba07ddd4 | 14:a222b4d3d52e |
|---|---|
| 1 <tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"> | 1 <tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"> |
| 2 <description>download genome sequence, annotation and metadata</description> | 2 <description>download genome sequence, annotation and metadata</description> |
| 3 <macros> | 3 <macros> |
| 4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
| 5 </macros> | 5 </macros> |
| 6 <expand macro="bio_tools"/> | |
| 6 <expand macro="requirements"></expand> | 7 <expand macro="requirements"></expand> |
| 8 <expand macro="version_command"/> | |
| 7 <command><![CDATA[ | 9 <command><![CDATA[ |
| 10 #import re | |
| 8 @SETUP_CERTIFICATES@ | 11 @SETUP_CERTIFICATES@ |
| 9 datasets download genome $query.subcommand.download_by | 12 datasets download genome $query.subcommand.download_by |
| 10 #if $query.subcommand.download_by == 'accession': | 13 #if $query.subcommand.download_by == 'accession': |
| 11 #if $query.subcommand.text_or_file.text_or_file == 'text': | 14 #if $query.subcommand.text_or_file.text_or_file == 'text': |
| 12 #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x) | 15 #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x) |
| 13 #else | 16 #else |
| 14 --inputfile '$query.subcommand.text_or_file.inputfile' | 17 --inputfile '$query.subcommand.text_or_file.inputfile' |
| 15 #end if | 18 #end if |
| 16 #else: | 19 #else: |
| 17 '$query.subcommand.taxon' | 20 '$query.subcommand.taxon_positional' |
| 18 $query.subcommand.tax_exact_match | 21 $query.subcommand.tax_exact_match |
| 19 #end if | 22 #end if |
| 20 $filters.reference | 23 $filters.reference |
| 21 $filters.annotated | 24 $filters.annotated |
| 22 #if $filters.assembly_level: | 25 #if $filters.assembly_level: |
| 35 @RELEASED_AFTER@ | 38 @RELEASED_AFTER@ |
| 36 #for search_term in $filters.search: | 39 #for search_term in $filters.search: |
| 37 --search '$filters.search_term' | 40 --search '$filters.search_term' |
| 38 #end for | 41 #end for |
| 39 --no-progressbar | 42 --no-progressbar |
| 40 #if $uncompressed | 43 --dehydrated |
| 41 && 7z x -y ncbi_dataset.zip | 44 |
| 42 #else | 45 ## produce TSV report file |
| 43 && 7z l ncbi_dataset.zip > ncbi_dataset.txt | 46 && dataformat tsv genome |
| 47 --package ncbi_dataset.zip | |
| 48 --fields #echo ",".join($file_choices.report_columns) | |
| 49 > genome_data_report.tsv | |
| 50 | |
| 51 ## unzip and rehydrate if any data is to be downloaded (include is not None) | |
| 52 #if $file_choices.include | |
| 53 ## unzip | |
| 54 && 7z x -y ncbi_dataset.zip > 7z.log | |
| 55 | |
| 56 ## rehydrate | |
| 57 && datasets rehydrate | |
| 58 --directory ./ | |
| 59 #if not $file_choices.decompress | |
| 60 --gzip | |
| 61 #end if | |
| 62 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10} | |
| 63 | |
| 64 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery | |
| 65 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \; | |
| 66 | |
| 67 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz) | |
| 68 ## note "not decompress" means that the datasets are provided compressed (datasets rehydrate is called with --gzip) | |
| 69 ## in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression | |
| 70 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \; | |
| 71 #if not $file_choices.decompress | |
| 72 && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \; | |
| 73 #end if | |
| 74 | |
| 75 #if "seq-report" in $file_choices.include | |
| 76 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \; | |
| 77 #end if | |
| 78 | |
| 79 && true ## because Galaxy removes trailing ; from command | |
| 44 #end if | 80 #end if |
| 45 ]]></command> | 81 ]]></command> |
| 46 <inputs> | 82 <inputs> |
| 47 <section name="query" title="Query" expanded="true"> | 83 <section name="query" title="Query" expanded="true"> |
| 48 <conditional name="subcommand"> | 84 <conditional name="subcommand"> |
| 49 <param name="download_by" type="select" label="Choose how to find genomes to download"> | 85 <param name="download_by" type="select" label="Choose how to find genomes to download"> |
| 50 <option value="accession">Download by NCBI assembly or BioProject accession</option> | 86 <option value="accession">By NCBI assembly or BioProject accession</option> |
| 51 <option value="taxon">Download by taxon</option> | 87 <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option> |
| 52 </param> | 88 </param> |
| 53 <when value="accession"> | 89 <when value="accession"> |
| 54 <expand macro="text_or_file"/> | 90 <expand macro="text_or_file"/> |
| 55 </when> | 91 </when> |
| 56 <when value="taxon"> | 92 <when value="taxon"> |
| 57 <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/> | 93 <expand macro="taxon_positional"/> |
| 58 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/> | 94 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/> |
| 59 </when> | 95 </when> |
| 60 </conditional> | 96 </conditional> |
| 61 </section> | 97 </section> |
| 62 <section name="filters" title="Filters and Limit"> | 98 <section name="filters" title="Filters and Limit"> |
| 65 <expand macro="assembly_level"/> | 101 <expand macro="assembly_level"/> |
| 66 <param argument="--assembly-version" type="select" label="Assembly version(s)"> | 102 <param argument="--assembly-version" type="select" label="Assembly version(s)"> |
| 67 <option value="latest">Latest</option> | 103 <option value="latest">Latest</option> |
| 68 <option value="all">All</option> | 104 <option value="all">All</option> |
| 69 </param> | 105 </param> |
| 70 <!-- TODO add test for assembly source: according to CLI doc args are RefSeq, GenBank, All and not refseq / genbank--> | |
| 71 <expand macro="assembly_source"/> | 106 <expand macro="assembly_source"/> |
| 72 <expand macro="chromosomes"/> | 107 <expand macro="chromosomes"/> |
| 73 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/> | 108 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/> |
| 74 <expand macro="released_options"/> | 109 <expand macro="released_options"/> |
| 75 <expand macro="released_options" before_or_after="after"/> | 110 <expand macro="released_options" before_or_after="after"/> |
| 76 | 111 |
| 77 <repeat name="search" title="Add search terms"> | 112 <repeat name="search" title="Add search terms"> |
| 78 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> | 113 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> |
| 79 </repeat> | 114 </repeat> |
| 80 </section> | 115 </section> |
| 81 <section name="file_choices" title="File Choices" expanded="true"> | 116 <section name="file_choices" title="Output options" expanded="true"> |
| 82 <expand macro="include"/> | 117 <expand macro="tsv_report_columns"> |
| 118 <option value="accession" selected="true">accession</option> | |
| 119 <option value="organism-name" selected="true">organism-name</option> | |
| 120 <option value="assminfo-submitter" selected="true">assminfo-submitter</option> | |
| 121 <option value="assminfo-name" selected="true">assminfo-name</option> | |
| 122 </expand> | |
| 123 <expand macro="include"> | |
| 124 <expand macro="genome_includes"/> | |
| 125 </expand> | |
| 126 <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/> | |
| 83 </section> | 127 </section> |
| 84 <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/> | |
| 85 </inputs> | 128 </inputs> |
| 86 <outputs> | 129 <outputs> |
| 87 <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip"> | 130 <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/> |
| 88 <filter>not uncompressed</filter> | |
| 89 </data> | |
| 90 <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt"> | |
| 91 <filter>not uncompressed</filter> | |
| 92 </data> | |
| 93 <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl"> | |
| 94 <filter>uncompressed</filter> | |
| 95 </data> | |
| 96 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> | 131 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> |
| 97 <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 132 <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
| 98 <filter>uncompressed and file_choices['include'] and "seq-report" in file_choices['include']</filter> | 133 <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter> |
| 99 </collection> | 134 </collection> |
| 100 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> | 135 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> |
| 101 <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 136 <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?\.(?P<ext>fasta(\.gz)?)" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
| 102 <filter>uncompressed and file_choices['include'] and "genome" in file_choices['include']</filter> | 137 <filter>file_choices['include'] and "genome" in file_choices['include']</filter> |
| 103 </collection> | 138 </collection> |
| 104 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> | 139 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> |
| 105 <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 140 <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
| 106 <filter>uncompressed and file_choices['include'] and "rna" in file_choices['include']</filter> | 141 <filter>file_choices['include'] and "rna" in file_choices['include']</filter> |
| 107 </collection> | 142 </collection> |
| 108 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> | 143 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> |
| 109 <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 144 <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
| 110 <filter>uncompressed and file_choices['include'] and "protein" in file_choices['include']</filter> | 145 <filter>file_choices['include'] and "protein" in file_choices['include']</filter> |
| 111 </collection> | 146 </collection> |
| 112 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> | 147 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> |
| 113 <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 148 <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
| 114 <filter>uncompressed and file_choices['include'] and "cds" in file_choices['include']</filter> | 149 <filter>file_choices['include'] and "cds" in file_choices['include']</filter> |
| 115 </collection> | 150 </collection> |
| 116 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> | 151 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> |
| 117 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 152 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
| 118 <filter>uncompressed and file_choices['include'] and "gff3" in file_choices['include']</filter> | 153 <filter>file_choices['include'] and "gff3" in file_choices['include']</filter> |
| 119 </collection> | 154 </collection> |
| 120 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> | 155 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> |
| 121 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 156 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
| 122 <filter>uncompressed and file_choices['include'] and "gtf" in file_choices['include']</filter> | 157 <filter>file_choices['include'] and "gtf" in file_choices['include']</filter> |
| 123 </collection> | 158 </collection> |
| 124 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> | 159 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> |
| 125 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 160 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
| 126 <filter>uncompressed and file_choices['include'] and "gbff" in file_choices['include']</filter> | 161 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> |
| 127 </collection> | 162 </collection> |
| 128 </outputs> | 163 </outputs> |
| 129 <tests> | 164 <tests> |
| 165 <!-- Note: All but one test use the non-default decompress="true" | |
| 166 | |
| 167 this is because (at 11/22) Galaxy can not apply text assertions on the content | |
| 168 of compressed files https://github.com/galaxyproject/galaxy/pull/15085 | |
| 169 | |
| 170 So with decompress="true" more powerful assertions are possible. | |
| 171 A single test checks the default, ie decompress="false". | |
| 172 --> | |
| 173 <test expect_num_outputs="3"> | |
| 174 <conditional name="query|subcommand"> | |
| 175 <param name="download_by" value="taxon"/> | |
| 176 <param name="taxon_positional" value="human"/> | |
| 177 </conditional> | |
| 178 <param name="chromosomes" value="21"/> | |
| 179 <param name="released_before" value="01/01/2018"/> | |
| 180 <section name="file_choices"> | |
| 181 <!-- include a sequence (which should be downloaded as fasta.gz) | |
| 182 and one non-sequence (which should be decompressed) output --> | |
| 183 <param name="include" value="rna,gff3"/> | |
| 184 </section> | |
| 185 <output name="genome_data_report"> | |
| 186 <assert_contents> | |
| 187 <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> | |
| 188 <has_n_lines n="144"/> | |
| 189 <has_n_columns n="4"/> | |
| 190 </assert_contents> | |
| 191 </output> | |
| 192 <output_collection name="rna_fasta" type="list" count="1"> | |
| 193 <element name="GCF_000306695.2" decompress="true"> | |
| 194 <assert_contents> | |
| 195 <has_text text=">"/> | |
| 196 </assert_contents> | |
| 197 </element> | |
| 198 </output_collection> | |
| 199 <output_collection name="genomic_gff" type="list"> | |
| 200 <element name="GCF_000306695.2"> | |
| 201 <assert_contents> | |
| 202 <has_n_lines min="1000000"/> | |
| 203 <has_line line="##gff-version 3"/> | |
| 204 <!-- TODO this will only work when the galaxy python packages for 22.05 have been released | |
| 205 <has_n_columns n="9" comment="#"/> --> | |
| 206 </assert_contents> | |
| 207 </element> | |
| 208 </output_collection> | |
| 209 <assert_command> | |
| 210 <has_text text="gunzip"/> | |
| 211 </assert_command> | |
| 212 </test> | |
| 130 <test expect_num_outputs="2"> | 213 <test expect_num_outputs="2"> |
| 131 <conditional name="query|subcommand"> | 214 <conditional name="query|subcommand"> |
| 132 <param name="download_by" value="taxon"/> | 215 <param name="download_by" value="taxon"/> |
| 133 <param name="text_or_file" value="text"/> | 216 <param name="taxon_positional" value="human"/> |
| 134 <param name="taxon" value="human"/> | |
| 135 </conditional> | 217 </conditional> |
| 136 <param name="chromosomes" value="21"/> | 218 <param name="chromosomes" value="21"/> |
| 137 <param name="include" value=""/> | |
| 138 <param name="uncompressed" value="false"/> | |
| 139 <param name="released_before" value="01/01/2018"/> | |
| 140 <output name="archive_contents"> | |
| 141 <assert_contents> | |
| 142 <has_text text="ncbi_dataset/data/dataset_catalog.json"/> | |
| 143 </assert_contents> | |
| 144 </output> | |
| 145 </test> | |
| 146 <test expect_num_outputs="2"> | |
| 147 <conditional name="query|subcommand"> | |
| 148 <param name="download_by" value="taxon"/> | |
| 149 <param name="text_or_file" value="text"/> | |
| 150 <param name="taxon" value="human"/> | |
| 151 </conditional> | |
| 152 <param name="chromosomes" value="21"/> | |
| 153 <param name="include" value="genome"/> | |
| 154 <param name="uncompressed" value="true"/> | |
| 155 <param name="assembly_level" value="chromosome,complete"/> | 219 <param name="assembly_level" value="chromosome,complete"/> |
| 156 <param name="released_before" value="01/01/2018"/> | 220 <param name="released_before" value="01/01/2018"/> |
| 221 <section name="file_choices"> | |
| 222 <param name="include" value="genome"/> | |
| 223 <param name="decompress" value="true"/> | |
| 224 </section> | |
| 157 <output_collection name="genome_fasta" type="list:list" count="14"> | 225 <output_collection name="genome_fasta" type="list:list" count="14"> |
| 158 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> | 226 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> |
| 159 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> | 227 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> |
| 160 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> | 228 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> |
| 161 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> | 229 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> |
| 172 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> | 240 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> |
| 173 </output_collection> | 241 </output_collection> |
| 174 <output name="genome_data_report"> | 242 <output name="genome_data_report"> |
| 175 <assert_contents> | 243 <assert_contents> |
| 176 <has_text text="Homo sapiens"/> | 244 <has_text text="Homo sapiens"/> |
| 177 </assert_contents> | 245 <has_n_columns n="4"/> |
| 178 </output> | 246 </assert_contents> |
| 179 </test> | 247 </output> |
| 180 <!-- same as precious test but assembly_source (refseq which removes some of the genomes) --> | 248 </test> |
| 249 <!-- same as previous test but assembly_source (refseq which removes some of the genomes) --> | |
| 181 <test expect_num_outputs="2"> | 250 <test expect_num_outputs="2"> |
| 182 <conditional name="query|subcommand"> | 251 <conditional name="query|subcommand"> |
| 183 <param name="download_by" value="taxon"/> | 252 <param name="download_by" value="taxon"/> |
| 184 <param name="text_or_file" value="text"/> | 253 <param name="taxon_positional" value="human"/> |
| 185 <param name="taxon" value="human"/> | |
| 186 </conditional> | 254 </conditional> |
| 187 <param name="chromosomes" value="21"/> | 255 <param name="chromosomes" value="21"/> |
| 188 <param name="include" value="genome"/> | |
| 189 <param name="uncompressed" value="true"/> | |
| 190 <param name="assembly_level" value="chromosome,complete"/> | 256 <param name="assembly_level" value="chromosome,complete"/> |
| 191 <param name="assembly_source" value="refseq"/> | 257 <param name="assembly_source" value="refseq"/> |
| 192 <param name="released_before" value="01/01/2018"/> | 258 <param name="released_before" value="01/01/2018"/> |
| 259 <section name="file_choices"> | |
| 260 <param name="include" value="genome"/> | |
| 261 <param name="decompress" value="true"/> | |
| 262 </section> | |
| 193 <output_collection name="genome_fasta" type="list:list" count="2"> | 263 <output_collection name="genome_fasta" type="list:list" count="2"> |
| 194 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> | 264 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> |
| 195 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> | 265 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> |
| 196 </output_collection> | 266 </output_collection> |
| 197 <output name="genome_data_report"> | 267 <output name="genome_data_report"> |
| 198 <assert_contents> | 268 <assert_contents> |
| 199 <has_text text="Homo sapiens"/> | 269 <has_text text="Homo sapiens"/> |
| 270 <has_n_lines n="5"/> | |
| 271 <has_n_columns n="4"/> | |
| 200 </assert_contents> | 272 </assert_contents> |
| 201 </output> | 273 </output> |
| 202 </test> | 274 </test> |
| 203 <test expect_num_outputs="4"> | 275 <test expect_num_outputs="4"> |
| 204 <conditional name="query|subcommand"> | 276 <conditional name="query|subcommand"> |
| 206 <conditional name="text_or_file"> | 278 <conditional name="text_or_file"> |
| 207 <param name="text_or_file" value="text"/> | 279 <param name="text_or_file" value="text"/> |
| 208 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> | 280 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> |
| 209 </conditional> | 281 </conditional> |
| 210 </conditional> | 282 </conditional> |
| 211 <param name="include" value="seq-report,gtf,cds"/> | |
| 212 <param name="uncompressed" value="true"/> | |
| 213 <param name="released_before" value="01/01/2007"/> | 283 <param name="released_before" value="01/01/2007"/> |
| 284 <section name="file_choices"> | |
| 285 <param name="include" value="seq-report,gtf,cds"/> | |
| 286 <param name="decompress" value="true"/> | |
| 287 </section> | |
| 214 <output name="genome_data_report"> | 288 <output name="genome_data_report"> |
| 215 <assert_contents> | 289 <assert_contents> |
| 216 <has_text text="GCF_000013305.1"/> | 290 <has_text text="GCF_000013305.1"/> |
| 217 </assert_contents> | 291 <has_n_lines n="3"/> |
| 218 </output> | 292 <has_n_columns n="4"/> |
| 293 </assert_contents> | |
| 294 </output> | |
| 295 <output_collection name="sequence_report" type="list" count="2" > | |
| 296 <element name="GCF_000007445.1"> | |
| 297 <assert_contents> | |
| 298 <has_text text="GCF_000007445.1"/> | |
| 299 <has_n_lines n="2"/> | |
| 300 <has_n_columns n="14"/> | |
| 301 </assert_contents> | |
| 302 </element> | |
| 303 <element name="GCF_000013305.1"> | |
| 304 <assert_contents> | |
| 305 <has_text text="GCF_000013305.1"/> | |
| 306 <has_n_lines n="2"/> | |
| 307 <has_n_columns n="14"/> | |
| 308 </assert_contents> | |
| 309 </element> | |
| 310 </output_collection> | |
| 219 <output_collection name="genomic_gtf" type="list"> | 311 <output_collection name="genomic_gtf" type="list"> |
| 220 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> | 312 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> |
| 221 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> | 313 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> |
| 222 </output_collection> | 314 </output_collection> |
| 223 <output_collection name="genomic_cds" type="list"> | 315 <output_collection name="genomic_cds" type="list"> |
| 224 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> | 316 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains" decompress="true"/> |
| 225 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/> | 317 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains" decompress="true"/> |
| 226 </output_collection> | 318 </output_collection> |
| 227 </test> | 319 </test> |
| 228 <test expect_num_outputs="4"> | 320 <test expect_num_outputs="4"> |
| 229 <conditional name="query|subcommand"> | 321 <conditional name="query|subcommand"> |
| 230 <param name="download_by" value="accession"/> | 322 <param name="download_by" value="accession"/> |
| 231 <conditional name="text_or_file"> | 323 <conditional name="text_or_file"> |
| 232 <param name="text_or_file" value="file"/> | 324 <param name="text_or_file" value="file"/> |
| 233 <param name="inputfile" value="accessions.txt"/> | 325 <param name="inputfile" value="accessions.txt"/> |
| 234 </conditional> | 326 </conditional> |
| 235 </conditional> | 327 </conditional> |
| 236 <param name="include" value="seq-report,gbff,gff3"/> | |
| 237 <param name="uncompressed" value="true"/> | |
| 238 <param name="released_before" value="01/01/2007"/> | 328 <param name="released_before" value="01/01/2007"/> |
| 239 <output name="genome_data_report"> | 329 <section name="file_choices"> |
| 240 <assert_contents> | 330 <param name="include" value="seq-report,gff3,gbff"/> |
| 241 <has_text text="SAMN02604181"/> | 331 <param name="decompress" value="true"/> |
| 332 </section> | |
| 333 <output name="genome_data_report"> | |
| 334 <assert_contents> | |
| 335 <has_text text="GCF_000013305.1"/> | |
| 336 <has_text text="GCF_000007445.1"/> | |
| 337 <has_n_lines n="3"/> | |
| 338 <has_n_columns n="4"/> | |
| 242 </assert_contents> | 339 </assert_contents> |
| 243 </output> | 340 </output> |
| 244 <output_collection name="genomic_gff" type="list"> | 341 <output_collection name="genomic_gff" type="list"> |
| 245 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> | 342 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> |
| 246 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> | 343 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> |
| 248 <output_collection name="genomic_gbff" type="list"> | 345 <output_collection name="genomic_gbff" type="list"> |
| 249 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> | 346 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> |
| 250 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> | 347 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> |
| 251 </output_collection> | 348 </output_collection> |
| 252 </test> | 349 </test> |
| 253 <test expect_num_outputs="2"> | 350 |
| 351 <!-- should not fail https://github.com/ncbi/datasets/issues/194 --> | |
| 352 <test expect_num_outputs="2" expect_failure="true"> | |
| 254 <conditional name="query|subcommand"> | 353 <conditional name="query|subcommand"> |
| 255 <param name="download_by" value="accession"/> | 354 <param name="download_by" value="accession"/> |
| 256 <conditional name="text_or_file"> | 355 <conditional name="text_or_file"> |
| 257 <param name="text_or_file" value="text"/> | 356 <param name="text_or_file" value="text"/> |
| 258 <param name="accession" value="GCF_000001405"/> | 357 <param name="accession" value="GCF_000001405"/> |
| 259 </conditional> | 358 </conditional> |
| 260 </conditional> | 359 </conditional> |
| 261 <param name="include" value="seq-report"/> | |
| 262 <param name="uncompressed" value="true"/> | |
| 263 <param name="released_before" value="01/01/2015"/> | 360 <param name="released_before" value="01/01/2015"/> |
| 264 <param name="assembly_version" value="all"/> | 361 <param name="assembly_version" value="all"/> |
| 265 <output_collection name="sequence_report" count="4"> | 362 <section name="file_choices"> |
| 266 <element name="GCF_000001405.25"> | 363 <param name="include" value="seq-report"/> |
| 267 <assert_contents> | 364 </section> |
| 268 <has_text text="assignedMoleculeLocationType"/> | 365 <!-- |
| 269 </assert_contents> | 366 <output_collection name="sequence_report" type="list" count="4" > |
| 270 </element> | 367 --> |
| 271 <element name="GCF_000001405.26"> | |
| 272 <assert_contents> | |
| 273 <has_text text="assignedMoleculeLocationType"/> | |
| 274 </assert_contents> | |
| 275 </element> | |
| 276 <element name="GCF_000001405.27"> | |
| 277 <assert_contents> | |
| 278 <has_text text="assignedMoleculeLocationType"/> | |
| 279 </assert_contents> | |
| 280 </element> | |
| 281 <element name="GCF_000001405.28"> | |
| 282 <assert_contents> | |
| 283 <has_text text="assignedMoleculeLocationType"/> | |
| 284 </assert_contents> | |
| 285 </element> | |
| 286 </output_collection> | |
| 287 </test> | 368 </test> |
| 288 <test expect_num_outputs="5"> | 369 <test expect_num_outputs="5"> |
| 289 <conditional name="query|subcommand"> | 370 <conditional name="query|subcommand"> |
| 290 <param name="download_by" value="accession"/> | 371 <param name="download_by" value="accession"/> |
| 291 <conditional name="text_or_file"> | 372 <conditional name="text_or_file"> |
| 292 <param name="text_or_file" value="text"/> | 373 <param name="text_or_file" value="text"/> |
| 293 <param name="accession" value="GCF_000146045.2"/> | 374 <param name="accession" value="GCF_000146045.2"/> |
| 294 </conditional> | 375 </conditional> |
| 295 </conditional> | 376 </conditional> |
| 296 <param name="include" value="seq-report,genome,rna,cds"/> | 377 <section name="file_choices"> |
| 297 <param name="uncompressed" value="true"/> | 378 <param name="include" value="genome,protein,rna,cds"/> |
| 379 <param name="decompress" value="true"/> | |
| 380 </section> | |
| 298 <output_collection name="genome_fasta" type="list:list" count="1"> | 381 <output_collection name="genome_fasta" type="list:list" count="1"> |
| 299 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> | 382 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> |
| 300 </output_collection> | 383 </output_collection> |
| 384 <output_collection name="protein_fasta" type="list" count="1"> | |
| 385 <element name="GCF_000146045.2" decompress="true"> | |
| 386 <assert_contents> | |
| 387 <has_text text=">"/> | |
| 388 </assert_contents> | |
| 389 </element> | |
| 390 </output_collection> | |
| 391 <output_collection name="rna_fasta" type="list" count="1"> | |
| 392 <element name="GCF_000146045.2" decompress="true"> | |
| 393 <assert_contents> | |
| 394 <has_text text=">"/> | |
| 395 </assert_contents> | |
| 396 </element> | |
| 397 </output_collection> | |
| 398 </test> | |
| 399 <!-- same as the previous test, but use the default value for decompress, | |
| 400 see comment at the beginning of the tests --> | |
| 401 <test expect_num_outputs="5"> | |
| 402 <conditional name="query|subcommand"> | |
| 403 <param name="download_by" value="accession"/> | |
| 404 <conditional name="text_or_file"> | |
| 405 <param name="text_or_file" value="text"/> | |
| 406 <param name="accession" value="GCF_000146045.2"/> | |
| 407 </conditional> | |
| 408 </conditional> | |
| 409 <section name="file_choices"> | |
| 410 <param name="include" value="genome,protein,rna,cds"/> | |
| 411 </section> | |
| 412 <output_collection name="genome_fasta" type="list:list" count="1"> | |
| 413 <element name="GCF_000146045.2"> | |
| 414 <element name="GCF_000146045.2_R64" ftype="fasta.gz"> | |
| 415 <assert_contents> | |
| 416 <has_size value="3843460"/> | |
| 417 </assert_contents> | |
| 418 </element> | |
| 419 </element> | |
| 420 </output_collection> | |
| 421 <output_collection name="protein_fasta" type="list" count="1"> | |
| 422 <element name="GCF_000146045.2" ftype="fasta.gz"> | |
| 423 <assert_contents> | |
| 424 <has_size value="1844838"/> | |
| 425 </assert_contents> | |
| 426 </element> | |
| 427 </output_collection> | |
| 428 <output_collection name="rna_fasta" type="list" count="1"> | |
| 429 <element name="GCF_000146045.2" ftype="fasta.gz"> | |
| 430 <assert_contents> | |
| 431 <has_size value="2784534"/> | |
| 432 </assert_contents> | |
| 433 </element> | |
| 434 </output_collection> | |
| 301 </test> | 435 </test> |
| 302 <test expect_num_outputs="3"> | 436 <test expect_num_outputs="3"> |
| 303 <conditional name="query|subcommand"> | 437 <conditional name="query|subcommand"> |
| 304 <param name="download_by" value="accession"/> | 438 <param name="download_by" value="accession"/> |
| 305 <conditional name="text_or_file"> | 439 <conditional name="text_or_file"> |
| 306 <param name="text_or_file" value="text"/> | 440 <param name="text_or_file" value="text"/> |
| 307 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/> | 441 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/> |
| 308 </conditional> | 442 </conditional> |
| 309 </conditional> | 443 </conditional> |
| 310 <param name="include" value="seq-report,genome"/> | 444 <section name="file_choices"> |
| 311 <param name="uncompressed" value="true"/> | 445 <param name="include" value="seq-report,genome"/> |
| 446 <param name="decompress" value="true"/> | |
| 447 </section> | |
| 448 <output_collection name="sequence_report" type="list" count="2"/> | |
| 312 <output_collection name="genome_fasta" type="list:list" count="2"> | 449 <output_collection name="genome_fasta" type="list:list" count="2"> |
| 313 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/> | 450 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/> |
| 314 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> | 451 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> |
| 315 </output_collection> | 452 </output_collection> |
| 316 </test> | 453 </test> |
| 318 https://github.com/ncbi/datasets/issues/187 | 455 https://github.com/ncbi/datasets/issues/187 |
| 319 hence we set expect_test_failure="true"--> | 456 hence we set expect_test_failure="true"--> |
| 320 <test expect_num_outputs="1" expect_test_failure="true"> | 457 <test expect_num_outputs="1" expect_test_failure="true"> |
| 321 <conditional name="query|subcommand"> | 458 <conditional name="query|subcommand"> |
| 322 <param name="download_by" value="taxon"/> | 459 <param name="download_by" value="taxon"/> |
| 323 <param name="text_or_file" value="text"/> | 460 <param name="taxon_positional" value="4932"/> |
| 324 <param name="taxon" value="4932"/> | |
| 325 <param name="tax_exact_match" value="true"/> | 461 <param name="tax_exact_match" value="true"/> |
| 326 </conditional> | 462 </conditional> |
| 327 <param name="include" value=""/> | 463 <param name="include" value=""/> |
| 328 <param name="uncompressed" value="true"/> | |
| 329 <output name="genome_data_report"> | 464 <output name="genome_data_report"> |
| 330 <assert_contents> | 465 <assert_contents> |
| 331 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> | 466 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> |
| 332 </assert_contents> | 467 </assert_contents> |
| 333 </output> | 468 </output> |
| 336 <help> | 471 <help> |
| 337 <![CDATA[ | 472 <![CDATA[ |
| 338 **Download Genome Datasets from NCBI** | 473 **Download Genome Datasets from NCBI** |
| 339 | 474 |
| 340 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. | 475 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. |
| 341 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. | 476 Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon. |
| 342 | 477 |
| 343 Tthe default genome dataset includes the following files (if available): | 478 The download is a three-step process: |
| 344 * data_report.jsonl (genome assembly and annotation metadata, not always available) | 479 |
| 345 * genomic.fna (genomic sequences) | 481 1. A "dehydrated" zip file is downloaded (which includes the metadata and the download URL) |
| 346 * rna.fna (transcript sequences) | 481 2. The metadata is transformed into a tabular (TSV) file |
| 347 * protein.faa (protein sequences) | 482 3. The data is hydrated (the actual data is downloaded) |
| 348 * genomic.gff (genome annotation in gff3 format) | 483 |
| 349 * dataset_catalog.json (a list of files and file types included in the dataset) | 484 The 3rd step can be skipped by unselecting all output types in the `Include` parameter. |
| | 485 This makes it possible to inspect the metadata prior to the actual data download. It also | |
| | 486 allows using the tool to query for data sets (and their accessions) of interest, which | |
| | 487 can then be downloaded in a second call using the accessions. | |
| 350 ]]> | 488 ]]> |
| 351 </help> | 489 </help> |
| 352 | 490 <expand macro="citations"/> |
| 353 </tool> | 491 </tool> |
