diff datasets_gene.xml @ 0:c6009f4d7261 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit fd91cf3000d556d8219426eddb8a3059071a2009"
author iuc
date Thu, 15 Jul 2021 13:31:56 +0000
parents
children 48e0814f250a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets_gene.xml	Thu Jul 15 13:31:56 2021 +0000
@@ -0,0 +1,206 @@
+<tool id="datasets-download-gene" name="NCBI datasets download gene" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@">
+    <description>Download genes from NCBI</description>
+    <macros> <!-- the @...@ tokens and the expanded macros used in this file are presumably defined in macros.xml (not shown here) -->
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"></expand>
+    <command><![CDATA[
+@SETUP_CERTIFICATES@
+datasets download gene $subcommand.download_by
+## Positional part: the taxon, a file of identifiers, or the identifiers typed by the user.
+#if $subcommand.download_by == 'taxon'
+    '$subcommand.taxon'
+#else if $subcommand.text_or_file.text_or_file != 'text'
+    --inputfile '$subcommand.text_or_file.inputfile'
+#else if $subcommand.download_by == 'gene-id'
+    ## Left unquoted: the gene-id sanitizer only lets digits and spaces through.
+    $subcommand.text_or_file.accession
+#else
+    #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x)
+#end if
+## Mode-specific options; they apply to typed and file-based identifier lists alike.
+#if $subcommand.download_by == 'symbol'
+    --taxon '$subcommand.taxon'
+#else if $subcommand.download_by == 'accession' and $subcommand.taxon_filter
+    --taxon-filter '$subcommand.taxon_filter'
+#end if
+@EXCLUDES_GENE@
+#if $subcommand.download_by == 'accession' and $subcommand.include_flanks_bp:
+    --include-flanks-bp $subcommand.include_flanks_bp
+#end if
+## Unpack the archive so the outputs can be collected from ncbi_dataset/data/.
+&& 7z x ncbi_dataset.zip
+]]></command>
+    <inputs>
+        <!-- The selected download_by value is also used verbatim as the `datasets download gene` subcommand in the command template. -->
+        <conditional name="subcommand">
+            <param name="download_by" type="select" label="Choose how to find genes to download">
+                <option value="gene-id">Download a gene dataset by NCBI Gene ID</option>
+                <option value="symbol">Download a gene dataset by gene symbol</option>
+                <option value="accession">Download a gene dataset by RefSeq nucleotide or protein accession</option>
+                <option value="taxon">Download a gene dataset by taxon</option>
+            </param>
+            <when value="gene-id">
+                <expand macro="text_or_file" what="gene-id" what_extended="NCBI Gene ID" help="Should be valid NCBI Gene ID">
+                    <sanitizer invalid_char="">
+                        <valid initial="string.digits">
+                            <add value=" " />
+                        </valid>
+                    </sanitizer>
+                </expand>
+            </when>
+            <when value="symbol">
+                <expand macro="text_or_file" what="symbol" what_extended="gene symbol" help="Should be valid gene symbol"/>
+                <param argument="--taxon" type="text" value="human" label="Specify a species name" help="Species name can be common or scientific name or species-level NCBI Taxonomy ID"/>
+            </when>
+            <when value="accession">
+                <expand macro="text_or_file" what="accession" what_extended="RefSeq nucleotide or protein accession" help="Should be RefSeq nucleotide or protein accession"/>
+                <param argument="--include-flanks-bp" type="integer" optional="true" min="0" label="Include gene flanking sequence, limited to prokaryotic genes" help="If not specified flanking gene sequences will not be downloaded. Accession must start with WP"/>
+                <param argument="--taxon-filter" type="text" optional="true" label="limit genes to a specified taxon" help="any rank"/>
+            </when>
+            <when value="taxon">
+                <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, Bos taurus, etc."/>
+            </when>
+        </conditional>
+        <expand macro="excludes_gene"></expand>
+        <conditional name="limit_fasta" label="Limit fasta by accession?">
+            <param name="limit" type="select" label="Select limit method">
+                <option value="none">None</option>
+                <option value="text">Enter list of accessions</option>
+                <option value="file">Read list of accessions from file</option>
+            </param>
+            <when value="none"/>
+            <when value="text">
+                <param argument="--fasta-filter" type="text" label="Limit gene fasta download to these accessions"/>
+            </when>
+            <when value="file">
+                <param argument="--fasta-filter-file" type="data" format="txt" label="File of accessions to limit gene fasta download"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs> <!-- every dataset is collected from the archive that the command extracts into ncbi_dataset/data/ -->
+        <data name="gene_fasta" format="fasta" label="NCBI datasets gene: gene fasta" from_work_dir="ncbi_dataset/data/gene.fna">
+            <filter>not exclude_gene</filter>
+        </data>
+        <data name="protein_fasta" format="fasta" label="NCBI datasets gene: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa">
+            <filter>not exclude_protein</filter>
+        </data>
+        <data name="rna_fasta" format="fasta" label="NCBI datasets gene: rna fasta" from_work_dir="ncbi_dataset/data/rna.fna">
+            <filter>not exclude_rna</filter>
+        </data>
+        <data name="gene_flanks" format="fasta" label="NCBI datasets gene: flanking sequence fasta" from_work_dir="ncbi_dataset/data/gene_flank.fna">
+            <filter><![CDATA[subcommand['download_by'] == 'accession' and subcommand['include_flanks_bp']]]></filter> <!-- include_flanks_bp exists only in the accession branch, so check the mode first -->
+        </data>
+    </outputs>
+    <tests>
+        <test title="test download by gene-id"> <!-- two space-separated IDs entered as text -->
+            <conditional name="subcommand">
+                <param name="download_by" value="gene-id"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="472 672"/>
+                </conditional>
+            </conditional>
+            <output name="gene_fasta">
+                <assert_contents>
+                    <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/>
+                    <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test title="test download by gene-id, test sanitizer"> <!-- the input carries characters the gene-id sanitizer must strip -->
+            <conditional name="subcommand">
+                <param name="download_by" value="gene-id"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="472 672;exit"/>
+                </conditional>
+            </conditional>
+            <output name="gene_fasta">
+                <assert_contents>
+                    <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/>
+                    <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/>
+                </assert_contents>
+            </output>
+            <assert_command> <!-- only digits and spaces may reach the command line -->
+                <not_has_text text="exit"/>
+            </assert_command>
+        </test>
+        <test title="test download by gene symbol"> <!-- asserts the same sequence lines as the gene-id tests -->
+            <conditional name="subcommand">
+                <param name="download_by" value="symbol"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="BRCA1 ATM"/>
+                </conditional>
+            </conditional>
+            <output name="gene_fasta">
+                <assert_contents>
+                    <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/>
+                    <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test title="test download by accession"> <!-- also checks that each accession is single-quoted on the command line -->
+            <conditional name="subcommand">
+                <param name="download_by" value="accession"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="NM_000546.6 NM_000492.4"/>
+                </conditional>
+            </conditional>
+            <assert_command>
+                <has_text text="'NM_000546.6' 'NM_000492.4'"/>
+            </assert_command>
+            <output name="gene_fasta">
+                <assert_contents>
+                    <has_line line="GTAGTAGGTCTTTGGCATTAGGAGCTTGAGCCCAGACGGCCCTAGCAGGGACCCCAGCGCCCGAGAGACC"/>
+                    <has_line line="CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGC"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test title="test download by accession with flanking sequence"> <!-- checks the flank option and the flank fasta output -->
+            <conditional name="subcommand">
+                <param name="download_by" value="accession"/>
+                <param name="include_flanks_bp" value="10"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="WP_004675351.1"/>
+                </conditional>
+            </conditional>
+            <assert_command>
+                <has_text text="--include-flanks-bp 10"/>
+            </assert_command>
+            <output name="gene_flanks">
+                <assert_contents>
+                    <has_line line="gccctgccgcATGATCGATCTGATGCCGACGAGCGAGGAACAGGCGGCGGCGATCGTCCGCACCCATGCG"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test title="test download by taxon"> <!-- rna and protein are excluded; only the gene fasta is checked -->
+            <param name="exclude_protein" value="true"/>
+            <param name="exclude_rna" value="true"/>
+            <conditional name="subcommand">
+                <param name="download_by" value="taxon"/>
+                <param name="taxon" value="Mycobacterium tuberculosis H37Rv"/>
+            </conditional>
+            <output name="gene_fasta">
+                <assert_contents>
+                    <has_line line="GTGGCGCTGAATATCAAAGACCCTGAGGTAGACCGACTAGCCGCCGAACTCGCTGACCGGCTGCACACCA"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+Download a gene dataset including gene, transcript and protein sequences, a data table and a data report. Gene datasets can be specified by NCBI Gene ID, gene symbol, RefSeq accession or taxon. Datasets are downloaded as a zip file, which this tool extracts to provide the FASTA files as separate outputs.
+
+The default gene dataset includes the following files:
+ * gene.fna (gene sequences)
+ * rna.fna (transcript sequences)
+ * protein.faa (protein sequences)
+ * data_report.jsonl (data report with gene metadata)
+ * data_table.tsv (data table with gene metadata, one transcript per row)
+ * dataset_catalog.json (a list of files and file types included in the dataset)
+    </help>
+
+</tool>