Mercurial > repos > iuc > ncbi_datasets
comparison datasets_genome.xml @ 21:7cd911289a7f draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5563c48ba342dbd3ef5dedf7254bfd5cb586ba65
| author | iuc |
|---|---|
| date | Wed, 14 Jan 2026 15:04:48 +0000 |
| parents | 35d32c807c23 |
| children | |
comparison
equal
deleted
inserted
replaced
| 20:35d32c807c23 | 21:7cd911289a7f |
|---|---|
| 63 && unzip ncbi_dataset.zip | 63 && unzip ncbi_dataset.zip |
| 64 | 64 |
| 65 ## rehydrate | 65 ## rehydrate |
| 66 && datasets rehydrate | 66 && datasets rehydrate |
| 67 --directory ./ | 67 --directory ./ |
| 68 #if not $file_choices.decompress | 68 --gzip |
| 69 --gzip | |
| 70 #end if | |
| 71 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10} | 69 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10} |
| 72 | 70 |
| 73 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery | 71 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery |
| 74 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} \| sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \; | 72 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} \| sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \; |
| 75 | 73 |
| 76 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz) | 74 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz) |
| 77 ## note "not decompress" means that the datasets are provided uncompressed (datasets rehydrate is called we --gzip) | 75 ## note "not decompress" means that the datasets are provided uncompressed (datasets rehydrate is called with --gzip) |
| 78 ## in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression | 76 ## in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression |
| 79 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \; | 77 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \; |
| 80 #if not $file_choices.decompress | 78 && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \; |
| 81 && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \; | 79 #if $file_choices.decompress |
| | 80 && find ncbi_dataset -name "*fasta.gz" -exec gunzip {} \; |
| 82 #end if | 81 #end if |
| 83 | 82 |
| 84 #if "seq-report" in $file_choices.include | 83 #if "seq-report" in $file_choices.include |
| 85 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \; | 84 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \; |
| 86 #end if | 85 #end if |
| 172 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> | 171 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> |
| 173 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> | 172 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> |
| 174 </collection> | 173 </collection> |
| 175 </outputs> | 174 </outputs> |
| 176 <tests> | 175 <tests> |
| | 176 <!-- download sequence and non-sequence data to test if unzipping works |
| | 177 sequence should be downloaded as gz and non-sequence unzipped |
| | 178 |
| | 179 restrict download size for testing by using release data filtering |
| | 180 --> |
| 177 <test expect_num_outputs="3"> | 181 <test expect_num_outputs="3"> |
| 178 <conditional name="query|subcommand"> | 182 <conditional name="query|subcommand"> |
| 179 <param name="download_by" value="taxon"/> | 183 <param name="download_by" value="taxon"/> |
| 180 <param name="taxon_positional" value="human"/> | 184 <param name="taxon_positional" value="human"/> |
| 181 </conditional> | 185 </conditional> |
| 182 <section name="filters"> | 186 <section name="filters"> |
| 183 <param name="chromosomes" value="21"/> | 187 <param name="released_after" value="08/31/2004"/> |
| 184 <param name="released_before" value="01/01/2018"/> | 188 <param name="released_before" value="01/01/2005"/> |
| 185 </section> | 189 </section> |
| 186 <section name="file_choices"> | 190 <section name="file_choices"> |
| 187 <!-- include a sequence (which should be downloaded as fasta.gz) | |
| 188 and one non-sequence (which should be decompressed) output --> | |
| 189 <param name="include" value="rna,gff3"/> | 191 <param name="include" value="rna,gff3"/> |
| 190 </section> | 192 </section> |
| 191 <output name="genome_data_report"> | 193 <output name="genome_data_report"> |
| 192 <assert_contents> | 194 <assert_contents> |
| 193 <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> | 195 <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> |
| 194 <has_n_lines min="140"/> | 196 <!-- no idea why the report contains 2 entries, but only one is downloaded |
| | 197 https://github.com/ncbi/datasets/issues/553 --> |
| | 198 <has_n_lines n="3"/> |
| 195 <has_n_columns n="4"/> | 199 <has_n_columns n="4"/> |
| 196 </assert_contents> | 200 </assert_contents> |
| 197 </output> | 201 </output> |
| 198 <output_collection name="rna_fasta" type="list"> | 202 <output_collection name="rna_fasta" type="list" count="1"> |
| 199 <element name="GCF_000306695.2" decompress="true"> | 203 <element name="GCF_000002135.2" decompress="true" ftype="fasta.gz"> |
| 200 <assert_contents> | 204 <assert_contents> |
| 201 <has_text text=">"/> | 205 <has_text text=">"/> |
| 202 </assert_contents> | 206 </assert_contents> |
| 203 </element> | 207 </element> |
| 204 </output_collection> | 208 </output_collection> |
| 205 <output_collection name="genomic_gff" type="list"> | 209 <output_collection name="genomic_gff" type="list" count="1"> |
| 206 <element name="GCF_000306695.2"> | 210 <element name="GCF_000002135.2" ftype="gff3"> |
| 207 <assert_contents> | 211 <assert_contents> |
| 208 <has_n_lines min="1000000"/> | 212 <has_n_lines min="40000"/> |
| 209 <has_line line="##gff-version 3"/> | 213 <has_line line="##gff-version 3"/> |
| 210 <has_n_columns n="9" comment="#"/> | 214 <has_n_columns n="9" comment="#"/> |
| 211 </assert_contents> | 215 </assert_contents> |
| 212 </element> | 216 </element> |
| 213 </output_collection> | 217 </output_collection> |
| 483 <conditional name="query|subcommand"> | 487 <conditional name="query|subcommand"> |
| 484 <param name="download_by" value="taxon"/> | 488 <param name="download_by" value="taxon"/> |
| 485 <param name="taxon_positional" value="4932"/> | 489 <param name="taxon_positional" value="4932"/> |
| 486 <param name="tax_exact_match" value="true"/> | 490 <param name="tax_exact_match" value="true"/> |
| 487 </conditional> | 491 </conditional> |
| 488 <output name="genome_data_report"> | 492 <section name="filters"> |
| 489 <assert_contents> | 493 <param name="released_before" value="11/01/2012"/> |
| | 494 </section> |
| | 495 <section name="file_choices"> |
| | 496 <param name="include" value="seq-report"/> |
| | 497 <param name="decompress" value="true"/> |
| | 498 </section> |
| | 499 <output name="genome_data_report"> |
| | 500 <assert_contents> |
| | 501 <has_n_lines n="2"/> |
| 490 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> | 502 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> |
| 491 </assert_contents> | 503 </assert_contents> |
| 492 </output> | 504 </output> |
| | 505 |
| 493 </test> | 506 </test> |
| 494 <!-- test search filter --> | 507 <!-- test search filter --> |
| 495 <test expect_num_outputs="1"> | 508 <test expect_num_outputs="1"> |
| 496 <conditional name="query|subcommand"> | 509 <conditional name="query|subcommand"> |
| 497 <param name="download_by" value="taxon"/> | 510 <param name="download_by" value="taxon"/> |
| 498 <param name="taxon_positional" value="Streptococcus"/> | 511 <param name="taxon_positional" value="Streptococcus"/> |
| 499 </conditional> | 512 </conditional> |
| | 513 <section name="filters"> |
| | 514 <param name="released_before" value="01/01/2010"/> |
| | 515 </section> |
| 500 <section name="filters"> | 516 <section name="filters"> |
| 501 <repeat name="search"> | 517 <repeat name="search"> |
| 502 <param name="search" value="pyogenes"/> | 518 <param name="search" value="pyogenes"/> |
| 503 </repeat> | 519 </repeat> |
| 504 </section> | 520 </section> |
| 505 <section name="file_choices"> | 521 <section name="file_choices"> |
| 506 <param name="include" value_json="null"/> | 522 <param name="include" value_json="null"/> |
| 507 </section> | 523 </section> |
| 508 <output name="genome_data_report"> | 524 <output name="genome_data_report"> |
| 509 <assert_contents> | 525 <assert_contents> |
| 510 <has_text text="pyogenes"/> | 526 <has_n_lines n="21"/> |
| | 527 <has_text text="pyogenes" n="20"/> |
| 511 </assert_contents> | 528 </assert_contents> |
| 512 </output> | 529 </output> |
| 513 </test> | 530 </test> |
| 514 </tests> | 531 </tests> |
| 515 <help><