Mercurial > repos > peterjc > ncbi_blast_plus
comparison ncbi_blast_plus/ncbi_blastp_wrapper.xml @ 20:688f3fb09a6a draft
Uploaded v0.0.20 preview 11, moved to GitHub, MIT license, reST markup.
author | peterjc |
---|---|
date | Tue, 30 Jul 2013 07:33:46 -0400 |
parents | |
children | 61f402b6e240 |
comparison
equal
deleted
inserted
replaced
19:c1a6e5aefee0 | 20:688f3fb09a6a |
---|---|
1 <tool id="ncbi_blastp_wrapper" name="NCBI BLAST+ blastp" version="0.0.20"> | |
2 <description>Search protein database with protein query sequence(s)</description> | |
3 <!-- If job splitting is enabled, break up the query file into parts --> | |
4 <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject,histdb" merge_outputs="output1"></parallelism> | |
5 <requirements> | |
6 <requirement type="binary">blastp</requirement> | |
7 <requirement type="package" version="2.2.26+">blast+</requirement> | |
8 </requirements> | |
9 <version_command>blastp -version</version_command> | |
10 <command> | |
11 ## The command is a Cheetah template which allows some Python based syntax. | |
12 ## Lines starting with hash hash (##) are comments. Galaxy will turn newlines into spaces. | |
13 blastp | |
14 -query "$query" | |
15 #if $db_opts.db_opts_selector == "db": | |
16 -db "${db_opts.database.fields.path}" | |
17 #elif $db_opts.db_opts_selector == "histdb": | |
18 -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" | |
19 #else: | |
20 -subject "$db_opts.subject" | |
21 #end if | |
22 -task $blast_type | |
23 -evalue $evalue_cutoff | |
24 -out "$output1" | |
25 ##Set the extended list here so if/when we add things, saved workflows are not affected | |
26 #if str($out_format)=="ext": | |
27 -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" | |
28 #else: | |
29 -outfmt $out_format | |
30 #end if | |
31 -num_threads 8 | |
32 #if $adv_opts.adv_opts_selector=="advanced": | |
33 $adv_opts.filter_query | |
34 -matrix $adv_opts.matrix | |
35 ## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string | |
36 ## Note -max_target_seqs overrides -num_descriptions and -num_alignments | |
37 #if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): | |
38 -max_target_seqs $adv_opts.max_hits | |
39 #end if | |
40 #if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): | |
41 -word_size $adv_opts.word_size | |
42 #end if | |
43 ##Ungapped disabled for now - see comments below | |
44 ##$adv_opts.ungapped | |
45 $adv_opts.parse_deflines | |
46 ## End of advanced options: | |
47 #end if | |
48 </command> | |
49 <stdio> | |
50 <!-- Anything other than zero is an error --> | |
51 <exit_code range="1:" /> | |
52 <exit_code range=":-1" /> | |
53 <!-- In case the return code has not been set properly, check stderr too --> | |
54 <regex match="Error:" /> | |
55 <regex match="Exception:" /> | |
56 </stdio> | |
57 <inputs> | |
58 <param name="query" type="data" format="fasta" label="Protein query sequence(s)"/> | |
59 <conditional name="db_opts"> | |
60 <param name="db_opts_selector" type="select" label="Subject database/sequences"> | |
61 <option value="db" selected="True">Locally installed BLAST database</option> | |
62 <option value="histdb">BLAST database from your history</option> | |
63 <option value="file">FASTA file from your history (see warning note below)</option> | |
64 </param> | |
65 <when value="db"> | |
66 <param name="database" type="select" label="Protein BLAST database"> | |
67 <options from_file="blastdb_p.loc"> | |
68 <column name="value" index="0"/> | |
69 <column name="name" index="1"/> | |
70 <column name="path" index="2"/> | |
71 </options> | |
72 </param> | |
73 <param name="histdb" type="hidden" value="" /> | |
74 <param name="subject" type="hidden" value="" /> | |
75 </when> | |
76 <when value="histdb"> | |
77 <param name="database" type="hidden" value="" /> | |
78 <param name="histdb" type="data" format="blastdbp" label="Protein BLAST database" /> | |
79 <param name="subject" type="hidden" value="" /> | |
80 </when> | |
81 <when value="file"> | |
82 <param name="database" type="hidden" value="" /> | |
83 <param name="histdb" type="hidden" value="" /> | |
84 <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/> | |
85 </when> | |
86 </conditional> | |
87 <param name="blast_type" type="select" display="radio" label="Type of BLAST"> | |
88 <option value="blastp">blastp</option> | |
89 <option value="blastp-short">blastp-short</option> | |
90 </param> | |
91 <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> | |
92 <param name="out_format" type="select" label="Output format"> | |
93 <option value="6">Tabular (standard 12 columns)</option> | |
94 <option value="ext" selected="True">Tabular (extended 24 columns)</option> | |
95 <option value="5">BLAST XML</option> | |
96 <option value="0">Pairwise text</option> | |
97 <option value="0 -html">Pairwise HTML</option> | |
98 <option value="2">Query-anchored text</option> | |
99 <option value="2 -html">Query-anchored HTML</option> | |
100 <option value="4">Flat query-anchored text</option> | |
101 <option value="4 -html">Flat query-anchored HTML</option> | |
102 <!-- | |
103 <option value="-outfmt 11">BLAST archive format (ASN.1)</option> | |
104 --> | |
105 </param> | |
106 <conditional name="adv_opts"> | |
107 <param name="adv_opts_selector" type="select" label="Advanced Options"> | |
108 <option value="basic" selected="True">Hide Advanced Options</option> | |
109 <option value="advanced">Show Advanced Options</option> | |
110 </param> | |
111 <when value="basic" /> | |
112 <when value="advanced"> | |
113 <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> | |
114 <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" /> | |
115 <param name="matrix" type="select" label="Scoring matrix"> | |
116 <option value="BLOSUM90">BLOSUM90</option> | |
117 <option value="BLOSUM80">BLOSUM80</option> | |
118 <option value="BLOSUM62" selected="true">BLOSUM62 (default)</option> | |
119 <option value="BLOSUM50">BLOSUM50</option> | |
120 <option value="BLOSUM45">BLOSUM45</option> | |
121 <option value="PAM250">PAM250</option> | |
122 <option value="PAM70">PAM70</option> | |
123 <option value="PAM30">PAM30</option> | |
124 </param> | |
125 <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> | |
126 <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> | |
127 <validator type="in_range" min="0" /> | |
128 </param> | |
129 <!-- I'd like word_size to be optional, with minimum 2 for blastp --> | |
130 <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> | |
131 <validator type="in_range" min="0" /> | |
132 </param> | |
133 <!-- | |
134 Can't use '-ungapped' on its own, error back is: | |
135 Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search | |
136 Tried using '-ungapped -comp_based_stats F' and blastp crashed with 'Attempt to access NULL pointer.' | |
137 <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" /> | |
138 --> | |
139 <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> | |
140 </when> | |
141 </conditional> | |
142 </inputs> | |
143 <outputs> | |
144 <data name="output1" format="tabular" label="${blast_type.value_label} on ${on_string}"> | |
145 <change_format> | |
146 <when input="out_format" value="0" format="txt"/> | |
147 <when input="out_format" value="0 -html" format="html"/> | |
148 <when input="out_format" value="2" format="txt"/> | |
149 <when input="out_format" value="2 -html" format="html"/> | |
150 <when input="out_format" value="4" format="txt"/> | |
151 <when input="out_format" value="4 -html" format="html"/> | |
152 <when input="out_format" value="5" format="blastxml"/> | |
153 </change_format> | |
154 </data> | |
155 </outputs> | |
156 <tests> | |
157 <test> | |
158 <param name="query" value="four_human_proteins.fasta" ftype="fasta" /> | |
159 <param name="db_opts_selector" value="file" /> | |
160 <param name="subject" value="rhodopsin_proteins.fasta" ftype="fasta" /> | |
161 <param name="database" value="" /> | |
162 <param name="evalue_cutoff" value="1e-8" /> | |
163 <param name="blast_type" value="blastp" /> | |
164 <param name="out_format" value="5" /> | |
165 <param name="adv_opts_selector" value="advanced" /> | |
166 <param name="filter_query" value="False" /> | |
167 <param name="matrix" value="BLOSUM62" /> | |
168 <param name="max_hits" value="0" /> | |
169 <param name="word_size" value="0" /> | |
170 <param name="parse_deflines" value="True" /> | |
171 <output name="output1" file="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" /> | |
172 </test> | |
173 <test> | |
174 <param name="query" value="four_human_proteins.fasta" ftype="fasta" /> | |
175 <param name="db_opts_selector" value="file" /> | |
176 <param name="subject" value="rhodopsin_proteins.fasta" ftype="fasta" /> | |
177 <param name="database" value="" /> | |
178 <param name="evalue_cutoff" value="1e-8" /> | |
179 <param name="blast_type" value="blastp" /> | |
180 <param name="out_format" value="6" /> | |
181 <param name="adv_opts_selector" value="advanced" /> | |
182 <param name="filter_query" value="False" /> | |
183 <param name="matrix" value="BLOSUM62" /> | |
184 <param name="max_hits" value="0" /> | |
185 <param name="word_size" value="0" /> | |
186 <param name="parse_deflines" value="True" /> | |
187 <output name="output1" file="blastp_four_human_vs_rhodopsin.tabular" ftype="tabular" /> | |
188 </test> | |
189 <test> | |
190 <param name="query" value="four_human_proteins.fasta" ftype="fasta" /> | |
191 <param name="db_opts_selector" value="file" /> | |
192 <param name="subject" value="rhodopsin_proteins.fasta" ftype="fasta" /> | |
193 <param name="database" value="" /> | |
194 <param name="evalue_cutoff" value="1e-8" /> | |
195 <param name="blast_type" value="blastp" /> | |
196 <param name="out_format" value="ext" /> | |
197 <param name="adv_opts_selector" value="advanced" /> | |
198 <param name="filter_query" value="False" /> | |
199 <param name="matrix" value="BLOSUM62" /> | |
200 <param name="max_hits" value="0" /> | |
201 <param name="word_size" value="0" /> | |
202 <param name="parse_deflines" value="True" /> | |
203 <output name="output1" file="blastp_four_human_vs_rhodopsin_ext.tabular" ftype="tabular" /> | |
204 </test> | |
205 <test> | |
206 <param name="query" value="rhodopsin_proteins.fasta" ftype="fasta" /> | |
207 <param name="db_opts_selector" value="file" /> | |
208 <param name="subject" value="four_human_proteins.fasta" ftype="fasta" /> | |
209 <param name="database" value="" /> | |
210 <param name="evalue_cutoff" value="1e-8" /> | |
211 <param name="blast_type" value="blastp" /> | |
212 <param name="out_format" value="6" /> | |
213 <param name="adv_opts_selector" value="basic" /> | |
214 <output name="output1" file="blastp_rhodopsin_vs_four_human.tabular" ftype="tabular" /> | |
215 </test> | |
216 </tests> | |
217 <help> | |
218 | |
219 .. class:: warningmark | |
220 | |
221 **Note**. Database searches may take a substantial amount of time. | |
222 For large input datasets it is advisable to allow overnight processing. | |
223 | |
224 ----- | |
225 | |
226 **What it does** | |
227 | |
228 Search a *protein database* using a *protein query*, | |
229 using the NCBI BLAST+ blastp command line tool. | |
230 | |
231 .. class:: warningmark | |
232 | |
233 You can also search against a FASTA file of subject protein | |
234 sequences. This is *not* advised because it is slower (only one | |
235 CPU is used), but more importantly gives e-values for pairwise | |
236 searches (very small e-values which will look overly significant). | |
237 In most cases you should instead turn the other FASTA file into a | |
238 database first using *makeblastdb* and search against that. | |
239 | |
240 ----- | |
241 | |
242 **Output format** | |
243 | |
244 Because Galaxy focuses on processing tabular data, the default output of this | |
245 tool is tabular. The standard BLAST+ tabular output contains 12 columns: | |
246 | |
247 ====== ========= ============================================ | |
248 Column NCBI name Description | |
249 ------ --------- -------------------------------------------- | |
250 1 qseqid Query Seq-id (ID of your sequence) | |
251 2 sseqid Subject Seq-id (ID of the database hit) | |
252 3 pident Percentage of identical matches | |
253 4 length Alignment length | |
254 5 mismatch Number of mismatches | |
255 6 gapopen Number of gap openings | |
256 7 qstart Start of alignment in query | |
257 8 qend End of alignment in query | |
258 9 sstart Start of alignment in subject (database hit) | |
259 10 send End of alignment in subject (database hit) | |
260 11 evalue Expectation value (E-value) | |
261 12 bitscore Bit score | |
262 ====== ========= ============================================ | |
263 | |
264 The BLAST+ tools can optionally output additional columns of information, | |
265 but this takes longer to calculate. Most (but not all) of these columns are | |
266 included by selecting the extended tabular output. The extra columns are | |
267 included *after* the standard 12 columns. This is so that you can write | |
268 workflow filtering steps that accept either the 12 or 24 column tabular | |
269 BLAST output. Galaxy now uses this extended 24 column output by default. | |
270 | |
271 ====== ============= =========================================== | |
272 Column NCBI name Description | |
273 ------ ------------- ------------------------------------------- | |
274 13 sallseqid All subject Seq-id(s), separated by a ';' | |
275 14 score Raw score | |
276 15 nident Number of identical matches | |
277 16 positive Number of positive-scoring matches | |
278 17 gaps Total number of gaps | |
279 18 ppos Percentage of positive-scoring matches | |
280 19 qframe Query frame | |
281 20 sframe Subject frame | |
282 21 qseq Aligned part of query sequence | |
283 22 sseq Aligned part of subject sequence | |
284 23 qlen Query sequence length | |
285 24 slen Subject sequence length | |
286 ====== ============= =========================================== | |
287 | |
288 The third option is BLAST XML output, which is designed to be parsed by | |
289 another program, and is understood by some Galaxy tools. | |
290 | |
291 You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). | |
292 The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. | |
293 The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. | |
294 The two query anchored outputs show a multiple sequence alignment between the query and all the matches, | |
295 and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). | |
296 | |
297 ------- | |
298 | |
299 **References** | |
300 | |
301 Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. | |
302 | |
303 Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005. | |
304 | |
305 This wrapper is available to install into other Galaxy Instances via the Galaxy | |
306 Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus | |
307 </help> | |
308 </tool> |