comparison datasets_genome.xml @ 21:7cd911289a7f draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5563c48ba342dbd3ef5dedf7254bfd5cb586ba65
author iuc
date Wed, 14 Jan 2026 15:04:48 +0000
parents 35d32c807c23
children
comparison
equal deleted inserted replaced
20:35d32c807c23 21:7cd911289a7f
63 && unzip ncbi_dataset.zip 63 && unzip ncbi_dataset.zip
64 64
65 ## rehydrate 65 ## rehydrate
66 && datasets rehydrate 66 && datasets rehydrate
67 --directory ./ 67 --directory ./
68 #if not $file_choices.decompress 68 --gzip
69 --gzip
70 #end if
71 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10} 69 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}
72 70
73 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery 71 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
74 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \; 72 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;
75 73
76 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz) 74 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
77 ## note "not decompress" means that the datasets are provided uncompressed (datasets rehydrate is called we --gzip) 75 ## note "not decompress" means that the datasets are provided uncompressed (datasets rehydrate is called with --gzip)
78 ## in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression 76 ## in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
79 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \; 77 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
80 #if not $file_choices.decompress 78 && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
81 && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \; 79 #if $file_choices.decompress
80 && find ncbi_dataset -name "*fasta.gz" -exec gunzip {} \;
82 #end if 81 #end if
83 82
84 #if "seq-report" in $file_choices.include 83 #if "seq-report" in $file_choices.include
85 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \; 84 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
86 #end if 85 #end if
172 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/> 171 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
173 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> 172 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
174 </collection> 173 </collection>
175 </outputs> 174 </outputs>
176 <tests> 175 <tests>
176 <!-- download sequence and non-sequence data to test if unzipping works
177 sequence should be downloaded as gz and non-sequence unzipped
178
179 restrict download size for testing by using release date filtering
180 -->
177 <test expect_num_outputs="3"> 181 <test expect_num_outputs="3">
178 <conditional name="query|subcommand"> 182 <conditional name="query|subcommand">
179 <param name="download_by" value="taxon"/> 183 <param name="download_by" value="taxon"/>
180 <param name="taxon_positional" value="human"/> 184 <param name="taxon_positional" value="human"/>
181 </conditional> 185 </conditional>
182 <section name="filters"> 186 <section name="filters">
183 <param name="chromosomes" value="21"/> 187 <param name="released_after" value="08/31/2004"/>
184 <param name="released_before" value="01/01/2018"/> 188 <param name="released_before" value="01/01/2005"/>
185 </section> 189 </section>
186 <section name="file_choices"> 190 <section name="file_choices">
187 <!-- include a sequence (which should be downloaded as fasta.gz)
188 and one non-sequence (which should be decompressed) output -->
189 <param name="include" value="rna,gff3"/> 191 <param name="include" value="rna,gff3"/>
190 </section> 192 </section>
191 <output name="genome_data_report"> 193 <output name="genome_data_report">
192 <assert_contents> 194 <assert_contents>
193 <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/> 195 <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/>
194 <has_n_lines min="140"/> 196 <!-- no idea why the report contains 2 entries, but only one is downloaded
197 https://github.com/ncbi/datasets/issues/553 -->
198 <has_n_lines n="3"/>
195 <has_n_columns n="4"/> 199 <has_n_columns n="4"/>
196 </assert_contents> 200 </assert_contents>
197 </output> 201 </output>
198 <output_collection name="rna_fasta" type="list"> 202 <output_collection name="rna_fasta" type="list" count="1">
199 <element name="GCF_000306695.2" decompress="true"> 203 <element name="GCF_000002135.2" decompress="true" ftype="fasta.gz">
200 <assert_contents> 204 <assert_contents>
201 <has_text text="&gt;"/> 205 <has_text text="&gt;"/>
202 </assert_contents> 206 </assert_contents>
203 </element> 207 </element>
204 </output_collection> 208 </output_collection>
205 <output_collection name="genomic_gff" type="list"> 209 <output_collection name="genomic_gff" type="list" count="1">
206 <element name="GCF_000306695.2"> 210 <element name="GCF_000002135.2" ftype="gff3">
207 <assert_contents> 211 <assert_contents>
208 <has_n_lines min="1000000"/> 212 <has_n_lines min="40000"/>
209 <has_line line="##gff-version 3"/> 213 <has_line line="##gff-version 3"/>
210 <has_n_columns n="9" comment="#"/> 214 <has_n_columns n="9" comment="#"/>
211 </assert_contents> 215 </assert_contents>
212 </element> 216 </element>
213 </output_collection> 217 </output_collection>
483 <conditional name="query|subcommand"> 487 <conditional name="query|subcommand">
484 <param name="download_by" value="taxon"/> 488 <param name="download_by" value="taxon"/>
485 <param name="taxon_positional" value="4932"/> 489 <param name="taxon_positional" value="4932"/>
486 <param name="tax_exact_match" value="true"/> 490 <param name="tax_exact_match" value="true"/>
487 </conditional> 491 </conditional>
488 <output name="genome_data_report"> 492 <section name="filters">
489 <assert_contents> 493 <param name="released_before" value="11/01/2012"/>
494 </section>
495 <section name="file_choices">
496 <param name="include" value="seq-report"/>
497 <param name="decompress" value="true"/>
498 </section>
499 <output name="genome_data_report">
500 <assert_contents>
501 <has_n_lines n="2"/>
490 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> 502 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
491 </assert_contents> 503 </assert_contents>
492 </output> 504 </output>
505
493 </test> 506 </test>
494 <!-- test search filter --> 507 <!-- test search filter -->
495 <test expect_num_outputs="1"> 508 <test expect_num_outputs="1">
496 <conditional name="query|subcommand"> 509 <conditional name="query|subcommand">
497 <param name="download_by" value="taxon"/> 510 <param name="download_by" value="taxon"/>
498 <param name="taxon_positional" value="Streptococcus"/> 511 <param name="taxon_positional" value="Streptococcus"/>
499 </conditional> 512 </conditional>
513 <section name="filters">
514 <param name="released_before" value="01/01/2010"/>
515 </section>
500 <section name="filters"> 516 <section name="filters">
501 <repeat name="search"> 517 <repeat name="search">
502 <param name="search" value="pyogenes"/> 518 <param name="search" value="pyogenes"/>
503 </repeat> 519 </repeat>
504 </section> 520 </section>
505 <section name="file_choices"> 521 <section name="file_choices">
506 <param name="include" value_json="null"/> 522 <param name="include" value_json="null"/>
507 </section> 523 </section>
508 <output name="genome_data_report"> 524 <output name="genome_data_report">
509 <assert_contents> 525 <assert_contents>
510 <has_text text="pyogenes"/> 526 <has_n_lines n="21"/>
527 <has_text text="pyogenes" n="20"/>
511 </assert_contents> 528 </assert_contents>
512 </output> 529 </output>
513 </test> 530 </test>
514 </tests> 531 </tests>
515 <help><![CDATA[ 532 <help><![CDATA[