comparison fasterq_dump.xml @ 15:54366e4d692a draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit 2be63abd7e4db27fa32ecbbc10d48cacb0073115"
author iuc
date Fri, 20 Mar 2020 16:36:52 +0000
parents
children f61761f373ea
comparison
equal deleted inserted replaced
14:5c5d6624f8de 15:54366e4d692a
1 <tool id="fasterq_dump" name="Faster Download and Extract Reads in FASTQ" version="@VERSION@+galaxy1" profile="18.01">
2 <description>format from NCBI SRA</description>
3 <macros>
4 <import>sra_macros.xml</import>
5 </macros>
6 <expand macro="requirements"/>
7 <version_command>fasterq-dump --version</version_command>
8 <command detect_errors="exit_code"><![CDATA[
9 @SET_ACCESSIONS@
10 #if $input.input_select == "file":
11 acc='${input.file.name}' &&
12 ln -s '${input.file}' "\$acc" &&
13 #end if
14 @CONFIGURE_TIMEOUT@
15 fasterq-dump "\$acc" -e \${GALAXY_SLOTS:-1}
16 $adv.split
17 #if str( $adv.minlen ) != "":
18 --min-read-len "$adv.minlen"
19 #end if
20 $adv.skip_technical >> '$log' 2>&1
21 &&
22 mkdir -p output &&
23 mkdir -p outputOther &&
24 count=`ls *.fastq | wc -l` &&
25 echo "There are \$count fastq" &&
26 data=(\$(ls *.fastq)) &&
27 if [ "\$count" -eq 1 ]; then
28 gzip -c "\${data[0]}" > output/"\${acc}"__single.fastqsanger.gz &&
29 rm "\${data[0]}";
30 elif [ "$adv.split" = "--split-3" ]; then
31 if [ -e "\${acc}".fastq ]; then
32 gzip -c "\${acc}".fastq > outputOther/"\${acc}"__single.fastqsanger.gz;
33 fi &&
34 gzip -c "\${acc}"_1.fastq > output/"\${acc}"_forward.fastqsanger.gz &&
35 gzip -c "\${acc}"_2.fastq > output/"\${acc}"_reverse.fastqsanger.gz &&
36 rm "\${acc}"*.fastq;
37 elif [ "\$count" -eq 2 ]; then
38 #if $adv.skip_technical:
39 gzip -c "\${data[0]}" > output/"\${acc}"_forward.fastqsanger.gz &&
40 gzip -c "\${data[1]}" > output/"\${acc}"_reverse.fastqsanger.gz &&
41 #else
42 gzip -c "\${data[0]}" > outputOther/"\${data[0]}"sanger.gz &&
43 gzip -c "\${data[1]}" > outputOther/"\${data[1]}"sanger.gz &&
44 #end if
45 rm "\${data[0]}" &&
46 rm "\${data[1]}";
47 else
48 for file in "\${data[@]}"; do
49 gzip -c "\$file" > outputOther/"\$file"sanger.gz &&
50 rm "\$file";
51 done;
52 fi;
53 #if $input.input_select=="file_list":
54 ) ; done
55
56 ;
57 #elif $input.input_select=="accession_number":
58 );
59 #end if
60 ]]>
61 </command> <!-- Template summary: runs fasterq-dump on the accession held in the shell variable acc, appends its messages to the log output, then gzips the resulting FASTQ files into output/ (forward/reverse pairs and __single files) or outputOther/ (everything else) and removes the uncompressed files. The log path and the array expansion in the final loop are quoted so that neither is re-split by the shell. The closing parenthesis and "done" emitted at the end presumably match a subshell or loop opened by the SET_ACCESSIONS macro from sra_macros.xml, which is not visible here; confirm before editing either side. -->
62 <inputs> <!-- Parameters exposed in the tool form. -->
63 <expand macro="input_conditional"/> <!-- Presumably defines the "input" conditional (input_select and its file / file_list / accession fields) that the command template and the tests refer to; the macro is imported from sra_macros.xml and is not visible here. -->
64 <section name="adv" title="Advanced Options" expanded="False"> <!-- Read by the command template as $adv.minlen, $adv.split and $adv.skip_technical. -->
65 <param name="minlen" type="integer" label="Minimum read length" optional="true" help="Filter by sequence length. Will dump only reads longer or equal to this value." argument="--min-read-len"/>
66 <param name="split" type="select" display="radio" label="Select how to split the spots" help="This option will only be used when there are multiple reads per spot (for example paired-end).">
67 <option value="--split-3">--split-3: write properly paired biological reads into different files and single reads in another file</option>
68 <option value="--split-files">--split-files: write reads into different files (forward and reverse may not match if one read is empty)</option>
69 <option value="--split-spot">--split-spot: split spots into reads (only one output file)</option>
70 <option value="--concatenate-reads">--concatenate-reads: writes whole spots into one file</option>
71 </param> <!-- The selected option value is written as is onto the fasterq-dump command line, and the command template also compares it against the split-3 value to decide how output files are sorted into collections. -->
72 <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="--include-technical" checked="True" label="Dump only biological reads" help="Will not be used if --split-3 is selected." argument="--skip-technical/--include-technical"/> <!-- The truevalue or falsevalue string is written as is onto the fasterq-dump command line; the template additionally tests this parameter when exactly two FASTQ files were produced. -->
73 </section>
74 </inputs>
75 <outputs>
76 <data name="log" format="txt" label="fasterq-dump log"/>
77 <collection name="list_paired" type="list:paired" label="Pair-end data (fasterq-dump)">
78
79 <!-- Use named regex groups to grab the pattern
80 <identifier_0>_<identifier_1>.fastqsanger.gz. Here identifier_0 is the
81 list identifier in the nested collection and identifier_1 is either
82 forward or reverse (for instance samp1_forward.fastqsanger.gz).
83 -->
84
85 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger\.gz" directory="output" ext="fastqsanger.gz" />
86 </collection>
87 <collection name="output_collection" type="list" label="Single-end data (fasterq-dump)">
88 <discover_datasets pattern="(?P&lt;designation&gt;.+)__single\.fastqsanger\.gz" directory="output" ext="fastqsanger.gz"/>
89 </collection>
90 <collection name="output_collection_other" type="list" label="Other data (fasterq-dump)">
91 <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fastqsanger\.gz" directory="outputOther" format="fastqsanger.gz"/>
92 </collection>
93 </outputs>
94 <tests>
95 <test> <!-- Single accession with default advanced options: expects one forward/reverse pair in list_paired. -->
96 <param name="input_select" value="accession_number"/>
97 <param name="accession" value="ERR086330"/>
98 <output_collection name="list_paired" type="list:paired">
99 <element name="ERR086330">
100 <element name="forward" file="ERR086330_1.fastq.gz" decompress="True">
101 </element>
102 <element name="reverse" file="ERR086330_2.fastq.gz" decompress="True">
103 </element>
104 </element>
105 </output_collection>
106 </test>
107 <test> <!-- Single accession, split-files mode with technical reads included: expects two datasets in the "other" collection. -->
108 <param name="input_select" value="accession_number"/>
109 <param name="accession" value="SRR002702"/>
110 <param name="split" value="--split-files"/>
111 <param name="skip_technical" value="False"/>
112 <output_collection name="output_collection_other" type="list">
113 <element name="SRR002702_1" file="SRR002702_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
114 <element name="SRR002702_2" file="SRR002702_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
115 </output_collection>
116 </test>
117 <test> <!-- SRA file from the history, split-files mode, biological reads only: expects one pair in list_paired named after the input file. -->
118 <param name="input_select" value="file"/>
119 <param name="file" value="SRR522874.sra"/>
120 <param name="split" value="--split-files"/>
121 <param name="skip_technical" value="True"/>
122 <output_collection name="list_paired" type="list:paired">
123 <element name="SRR522874.sra">
124 <element name="forward" file="SRR522874.sra_2.fastq.gz" decompress="True">
125 </element>
126 <element name="reverse" file="SRR522874.sra_4.fastq.gz" decompress="True">
127 </element>
128 </element>
129 </output_collection>
130 </test>
131 <test> <!-- Same SRA file with technical reads included: expects four datasets in the "other" collection. -->
132 <param name="input_select" value="file"/>
133 <param name="file" value="SRR522874.sra"/>
134 <param name="split" value="--split-files"/>
135 <param name="skip_technical" value="False"/>
136 <output_collection name="output_collection_other" type="list">
137 <element name="SRR522874.sra_1" file="SRR522874.sra_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
138 <element name="SRR522874.sra_2" file="SRR522874.sra_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
139 <element name="SRR522874.sra_3" file="SRR522874.sra_3.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
140 <element name="SRR522874.sra_4" file="SRR522874.sra_4.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
141 </output_collection>
142 </test>
143 <test> <!-- Accession list input with a minimum read length of 21: checks elements in all three output collections. -->
144 <param name="input_select" value="file_list"/>
145 <param name="file_list" value="list_sra"/>
146 <param name="minlen" value="21"/>
147 <output_collection name="output_collection_other" type="list">
148 <element name="SRR522874__single" file="SRR522874.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
149 </output_collection>
150 <output_collection name="list_paired" type="list:paired">
151 <element name="SRR522874">
152 <element name="forward" file="SRR522874_1.fastq.gz" decompress="True">
153 </element>
154 <element name="reverse" file="SRR522874_2.fastq.gz" decompress="True">
155 </element>
156 </element>
157 </output_collection>
158 <output_collection name="output_collection" type="list">
159 <element name="SRR002702" file="SRR002702_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
160 </output_collection>
161 </test>
162 </tests>
163 <help><![CDATA[
164 **What it does?**
165
166 This tool extracts data (in fastq_ format) from the Sequence Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fasterq-dump_ utility of the SRA Toolkit.
167
168 **How to use it?**
169
170 There are three ways in which you can download data:
171
172 1. Data for single accession
173 2. Multiple datasets using a list of accessions
174 3. Extract data from already uploaded SRA dataset
175
176 Below we discuss each in detail.
177
178 ------
179
180 **Uploading data for a single accession**
181
182 When you type a single accession number (e.g., `SRR1582967`) into **Accession** box and click **Execute** the tool will fetch data for you.
183
184 -----
185
186 **Uploading multiple datasets using a list of accessions**
187
188 A more realistic scenario is when you want to upload a number of datasets at once. To do this you need a list of accessions, with exactly one accession per line (see below for information on how to generate such a file). Once you have this file:
189
190 1. Upload it into your history using Galaxy's upload tool
191 2. Once the list of accessions is uploaded choose *List of SRA accessions, one per line* from **select input type** dropdown
192 3. Choose uploaded file within the **sra accession list** field
193 4. Click **Execute**
194
195 -----
196
197 **Extract data from already uploaded SRA dataset**
198
199 If an SRA dataset is present in the history, it can be converted into a fastq dataset by setting the **select input type** drop-down to *SRA archive in current history*. Just like in the case of extracting data for a single accession number, the following applies:
200
201 - if data is paired-ended (or mate-pair) the tool will, by default, generate separate forward and reverse datasets, grouped as a pair in the paired-end collection (see **Output** below).
202 - if data is single ended, a standard fastq dataset will be produced
203
204 -----
205
206 **Output**
207
208 In every case, fastq datasets produced will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets.
209 In fact, three collections will be produced: one containing paired-end data, another containing single-end data, and a third one which contains reads which could not be classified.
210 Some collections may be empty if the accessions provided do not contain one of these types of data.
211
212 .. class:: warningmark
213
214 When you decide to dump technical reads (in Advanced Options Dump only biological reads is set to No), you will probably find your PAIRED data in the other data collection as it is impossible to determine if it was 2 biological reads or one biological and one technical.
215
216 .. class:: warningmark
217
218 By default, only biological reads are dumped and, in the case of a PAIRED dataset, only the spots which have both reads will be in the paired-end collection. The remaining single reads will be in the other collection.
219 To keep all reads, even if this means the forward and reverse datasets do not contain the same number of reads, use the --split-files option in Advanced Options, Select how to split the spots.
220
221 @ACCESSION_LIST_HOWTO@
222
223 -----
224
225
226 .. _fastq: https://en.wikipedia.org/wiki/FASTQ_format
227 .. _fastq-dump: https://ncbi.github.io/sra-tools/fastq-dump.html
228 .. _fasterq-dump: https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump
229 .. _collection: https://galaxyproject.org/tutorials/collections/
230 .. _link: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=studies
231
232 @SRATOOLS_ATTRRIBUTION@
233
234 ]]>
235 </help>
236 <expand macro="citation"/>
237 </tool>