ncbi_datasets: datasets_genome.xml comparison

comparison datasets_genome.xml @ 21:7cd911289a7f draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5563c48ba342dbd3ef5dedf7254bfd5cb586ba65

author	iuc
date	Wed, 14 Jan 2026 15:04:48 +0000
parents	35d32c807c23
children

comparison

equal deleted inserted replaced

-:35d32c807c23
+:7cd911289a7f
 && unzip ncbi_dataset.zip
 ## rehydrate
 && datasets rehydrate
 --directory ./
-#if not $file_choices.decompress
+--gzip
---gzip
-#end if
 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}
 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;
 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
-## note "not decompress" means that the datasets are provided uncompressed (datasets rehydrate is called we --gzip)
+## note: datasets rehydrate is now always called with --gzip, so all datasets arrive compressed;
 ##      in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
-#if not $file_choices.decompress
+&& find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
-&& find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
+#if $file_choices.decompress
+&& find ncbi_dataset -name "*fasta.gz" -exec gunzip {} \;
 #end if
 #if "seq-report" in $file_choices.include
 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
 #end if
 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
 </collection>
 </outputs>
 <tests>
+<!-- download sequence and non-sequence data to test if unzipping works
+sequence should be downloaded as gz and non-sequence unzipped
+restrict download size for testing by using release data filtering
+-->
 <test expect_num_outputs="3">
 <conditional name="query|subcommand">
 <param name="download_by" value="taxon"/>
 <param name="taxon_positional" value="human"/>
 </conditional>
 <section name="filters">
-<param name="chromosomes" value="21"/>
+<param name="released_after" value="08/31/2004"/>
-<param name="released_before" value="01/01/2018"/>
+<param name="released_before" value="01/01/2005"/>
 </section>
 <section name="file_choices">
-<!-- include a sequence (which should be downloaded as fasta.gz)
-and one non-sequence (which should be decompressed) output -->
 <param name="include" value="rna,gff3"/>
 </section>
 <output name="genome_data_report">
 <assert_contents>
 <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/>
-<has_n_lines min="140"/>
+<!-- no idea why the report contains 2 entries, but only one is downloaded
+https://github.com/ncbi/datasets/issues/553 -->
+<has_n_lines n="3"/>
 <has_n_columns n="4"/>
 </assert_contents>
 </output>
-<output_collection name="rna_fasta" type="list">
+<output_collection name="rna_fasta" type="list" count="1">
-<element name="GCF_000306695.2" decompress="true">
+<element name="GCF_000002135.2" decompress="true" ftype="fasta.gz">
 <assert_contents>
 <has_text text="&gt;"/>
 </assert_contents>
 </element>
 </output_collection>
-<output_collection name="genomic_gff" type="list">
+<output_collection name="genomic_gff" type="list" count="1">
-<element name="GCF_000306695.2">
+<element name="GCF_000002135.2" ftype="gff3">
 <assert_contents>
-<has_n_lines min="1000000"/>
+<has_n_lines min="40000"/>
 <has_line line="##gff-version 3"/>
 <has_n_columns n="9" comment="#"/>
 </assert_contents>
 </element>
 </output_collection>
 <conditional name="query|subcommand">
 <param name="download_by" value="taxon"/>
 <param name="taxon_positional" value="4932"/>
 <param name="tax_exact_match" value="true"/>
 </conditional>
-<output name="genome_data_report">
+<section name="filters">
-<assert_contents>
+<param name="released_before" value="11/01/2012"/>
+</section>
+<section name="file_choices">
+<param name="include" value="seq-report"/>
+<param name="decompress" value="true"/>
+</section>
+<output name="genome_data_report">
+<assert_contents>
+<has_n_lines n="2"/>
 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
 </assert_contents>
 </output>
 </test>
 <!-- test search filter -->
 <test expect_num_outputs="1">
 <conditional name="query|subcommand">
 <param name="download_by" value="taxon"/>
 <param name="taxon_positional" value="Streptococcus"/>
 </conditional>
+<section name="filters">
+<param name="released_before" value="01/01/2010"/>
+</section>
 <section name="filters">
 <repeat name="search">
 <param name="search" value="pyogenes"/>
 </repeat>
 </section>
 <section name="file_choices">
 <param name="include" value_json="null"/>
 </section>
 <output name="genome_data_report">
 <assert_contents>
-<has_text text="pyogenes"/>
+<has_n_lines n="21"/>
+<has_text text="pyogenes" n="20"/>
 </assert_contents>
 </output>
 </test>
 </tests>
 <help><![CDATA[

Mercurial > repos > iuc > ncbi_datasets

comparison datasets_genome.xml @ 21:7cd911289a7f draft default tip