diff datasets_gene.xml @ 20:35d32c807c23 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5a65a62588a36d757f96681bf72f537c12c91beb
author iuc
date Fri, 26 Dec 2025 17:16:51 +0000
parents 9a10a6449901
children
line wrap: on
line diff
--- a/datasets_gene.xml	Mon Mar 17 11:05:34 2025 +0000
+++ b/datasets_gene.xml	Fri Dec 26 17:16:51 2025 +0000
@@ -4,7 +4,7 @@
         <import>macros.xml</import>
     </macros>
     <expand macro="bio_tools"/>
-    <expand macro="requirements"></expand>
+    <expand macro="requirements"/>
     <expand macro="version_command"/>
     <command><![CDATA[
 #import re
@@ -41,7 +41,7 @@
 
 #if $filters.fasta_filter_cond.fasta_filter_select
     #if $filters.fasta_filter_cond.fasta_filter_select == 'text'
-        --fasta-filter #echo ",".join(f"'{x}'" for x in $filters.fasta_filter_cond.fasta_filter.split(',') if x)
+        --fasta-filter #echo ",".join(f"'{x}'" for x in str($filters.fasta_filter_cond.fasta_filter).split(',') if x)
     #else
         --fasta-filter-file '$filters.fasta_filter_cond.fasta_filter_file'
     #end if
@@ -97,8 +97,8 @@
                     <param argument="--taxon" type="text" value="human" label="Species for gene symbol" help="NCBI taxid, common or scientific name">
                         <sanitizer invalid_char="">
                             <valid initial="string.letters">
-                                <add value=" " />
-                                <add value="-" />
+                                <add value=" "/>
+                                <add value="-"/>
                             </valid>
                         </sanitizer>
                     </param>
@@ -109,8 +109,8 @@
                     <param argument="--taxon-filter" type="text" value="" label="Limit gene sequences and annotation report file to specified taxon" help="any rank, only available for WP accessions">
                         <sanitizer invalid_char="">
                             <valid initial="string.letters">
-                                <add value=" " />
-                                <add value="-" />
+                                <add value=" "/>
+                                <add value="-"/>
                             </valid>
                         </sanitizer>
                     </param>
@@ -133,7 +133,7 @@
                     <param argument="--fasta-filter" type="text" label="RefSeq nucleotide and protein accessions" help="Comma separated">
                         <sanitizer invalid_char="">
                             <valid initial="string.letters,string.digits">
-                                <add value="," />
+                                <add value=","/>
                             </valid>
                         </sanitizer>
                     </param>
@@ -209,7 +209,7 @@
             <filter>file_choices['kingdom_cond']['include'] and "cds" in file_choices['kingdom_cond']['include']</filter>
         </data>
         <data name="threep_utr_fasta" label="NCBI Gene Datasets: 3' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/3p_utr.fna">
-            <filter>file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']</filter>
+            <filter>file_choices['kingdom_cond']['include'] and "3p-utr" in file_choices['kingdom_cond']['include']</filter>
         </data>
         <data name="fivep_utr_fasta" label="NCBI Gene Datasets: 5' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/5p_utr.fna">
             <filter>file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']</filter>
@@ -235,12 +235,12 @@
             </output>
             <output name="rna_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="protein_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
         </test>
@@ -263,12 +263,12 @@
             </output>
             <output name="rna_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="protein_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
         </test>
@@ -284,6 +284,7 @@
             </conditional>
             <section name="file_choices">
                 <conditional name="kingdom_cond">
+                    <param name="kingdom_sel" value="gene"/>
                     <param name="include" value="gene,cds"/>
                 </conditional>
             </section>
@@ -297,17 +298,17 @@
             </output>
             <output name="gene_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="cds_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
         </test>
         <!-- 4: datasets download gene symbol tp53 -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="symbol"/>
                 <conditional name="text_or_file">
@@ -315,11 +316,6 @@
                     <param name="accession" value="tp53"/>
                 </conditional>
             </conditional>
-            <section name="file_choices">
-                <conditional name="kingdom_cond">
-                    <param name="include" value=""/>
-                </conditional>
-            </section>
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
@@ -361,17 +357,17 @@
             </output>
             <output name="threep_utr_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="fivep_utr_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
         </test>
         <!-- 6: datasets download gene symbol brca1 \-\-ortholog -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="symbol"/>
                 <conditional name="text_or_file">
@@ -380,11 +376,6 @@
                 </conditional>
                 <param name="ortholog" value="rodentia"/>
             </conditional>
-            <section name="file_choices">
-                <conditional name="kingdom_cond">
-                    <param name="include" value=""/>
-                </conditional>
-            </section>
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="rat"/>
@@ -395,7 +386,7 @@
             </output>
         </test>
         <!-- 7: datasets download gene accession NP_000483.3 -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="accession"/>
                 <conditional name="text_or_file">
@@ -403,11 +394,6 @@
                     <param name="accession" value="NP_000483.3"/>
                 </conditional>
             </conditional>
-            <section name="file_choices">
-                <conditional name="kingdom_cond">
-                    <param name="include" value=""/>
-                </conditional>
-            </section>
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
@@ -417,7 +403,7 @@
             </output>
         </test>
         <!-- 8: datasets download gene accession NM_000546.6 NM_000492.4 + ortholog-->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="accession"/>
                 <conditional name="text_or_file">
@@ -426,11 +412,6 @@
                 </conditional>
                 <param name="ortholog" value="all"/>
             </conditional>
-            <section name="file_choices">
-                <conditional name="kingdom_cond">
-                    <param name="include" value=""/>
-                </conditional>
-            </section>
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
@@ -439,7 +420,6 @@
                 </assert_contents>
             </output>
         </test>
-
         <!-- 9: datasets download gene accession WP_003249567.1 + include_flanks_bp -->
         <test expect_num_outputs="4">
             <conditional name="query|subcommand">
@@ -466,24 +446,23 @@
             </output>
             <output name="gene_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="gene_flanks">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="protein_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <assert_command>
                 <has_text text="include-flanks-bp 100"/>
             </assert_command>
-        </test> 
-
+        </test>
         <!-- 10: datasets download gene taxon human   -->
         <!-- <test expect_num_outputs="1">
             <conditional name="query|subcommand">
@@ -534,15 +513,60 @@
             </output>
         </test> -->
     </tests>
-    <help>
-<![CDATA[
-**Download Gene Datasets from NCBI**
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+Downloads gene data from NCBI using the `datasets`_ command-line tool.
+Retrieves gene sequences, transcripts, proteins, and annotation reports.
+
+**Query Options**
+
+=============  ================================================================
+Method         Description
+=============  ================================================================
+Gene ID        NCBI Gene ID (e.g., 672 for BRCA1)
+Symbol         Gene symbol with taxon (e.g., TP53 in human)
+Accession      RefSeq nucleotide (NM\_) or protein (NP\_/WP\_) accession
+Taxon          All genes for a taxon (large downloads)
+=============  ================================================================
+
+----
+
+**Key Options**
+
+- **Ortholog retrieval**: Get orthologous genes across taxa (vertebrates/insects)
+- **Taxon filter**: Limit WP\_ accession results to specific organisms
+- **Flanking sequence**: Include nucleotides upstream/downstream (WP\_ only)
+- **FASTA filter**: Subset output to specific accessions
 
-Download a gene dataset (gene sequence, transcipt, amino acid sequences, 
-nucleotide coding sequences, 5'-UTR, 3'-UTR) as well as gene and gene
-product reports. Genes can be referred by gene id, symbol, accession,
-or taxon.
-]]>
-    </help>
+**Outputs (Eukaryote)**
+
+- **Gene Data Report**: Tabular metadata (ID, symbol, description, coordinates)
+- **Gene Product Report**: Detailed transcript/protein information
+- **Sequences**: Gene, RNA, protein, CDS, 5'/3' UTR FASTA files
+
+**Outputs (Prokaryote)**
+
+Prokaryotic genes (WP\_ accessions) use a different report format that lists
+accession, description, EC number, gene symbol, and protein information.
+
+**Examples**
+
+Download human BRCA1::
+
+    Query by: Gene ID
+    Gene ID: 672
+
+Download TP53 orthologs in rodents::
+
+    Query by: Symbol
+    Symbol: tp53
+    Ortholog: rodentia
+
+
+.. _datasets: https://www.ncbi.nlm.nih.gov/datasets/
+]]></help>
     <expand macro="citations"/>
 </tool>