Mercurial > repos > iuc > prot_scriber
comparison prot-scriber.xml @ 0:278aa57e2c4d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/prot-scriber commit 8b58f3f03d6430689d228029bb2eb46c16cfff23
| author | iuc |
|---|---|
| date | Tue, 10 May 2022 13:17:28 +0000 |
| parents | |
| children | a830e9f84593 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:278aa57e2c4d |
|---|---|
| 1 <tool id="prot_scriber" name="prot-scriber" version="@TOOL_VERSION@" profile="21.05"> | |
| 2 <description>Protein annotation of short human readable descriptions</description> | |
| 3 <macros> <!-- @TOOL_VERSION@ is defined once here and reused for the tool version and the package requirement version --> | |
| 4 <token name="@TOOL_VERSION@">0.1.1</token> | |
| 5 </macros> | |
| 6 <requirements> | |
| 7 <requirement type="package" version="@TOOL_VERSION@">prot-scriber</requirement> | |
| 8 </requirements> | |
| 9 <stdio> <!-- any "panicked" message on stderr is treated as a fatal error --> | |
| 10 <regex match="panicked" level="fatal" source="stderr" /> | |
| 11 </stdio> | |
| 12 <command> | |
| 13 <![CDATA['prot-scriber' | |
| 14 #if str($input_config.input_config_selector) == "basic" | |
| 15 #for $sst in $input_config.seq_sim_table | |
| 16 -s '$sst' | |
| 17 #end for | |
| 18 #else if str($input_config.input_config_selector) == "advanced" | |
| 19 #for $ssr in $input_config.advanced_input_repeat | |
| 20 -s '$ssr.seq_sim_table' | |
| 21 #if $ssr.header | |
| 22 -e '$ssr.header' | |
| 23 #end if | |
| 24 #if $ssr.field_separator | |
| 25 -p '$ssr.field_separator' | |
| 26 #end if | |
| 27 #if $ssr.blacklist_regexs | |
| 28 -b '$ssr.blacklist_regexs' | |
| 29 #end if | |
| 30 #if $ssr.capture_replace_pairs | |
| 31 -c '$ssr.capture_replace_pairs' | |
| 32 #end if | |
| 33 #if $ssr.filter_regexs | |
| 34 -l '$ssr.filter_regexs' | |
| 35 #end if | |
| 36 #end for | |
| 37 #if $input_config.expert_options.non_informative_words_regexs | |
| 38 -w '$input_config.expert_options.non_informative_words_regexs' | |
| 39 #end if | |
| 40 #if $input_config.expert_options.description_split_regex | |
| 41 -r "$input_config.expert_options.description_split_regex" | |
| 42 #end if | |
| 43 #if $input_config.expert_options.center_inverse_word_information_content_at_quantile | |
| 44 -q $input_config.expert_options.center_inverse_word_information_content_at_quantile | |
| 45 #end if | |
| 46 #end if | |
| 47 #if $seq_family.seq_families | |
| 48 -f '$seq_family.seq_families' | |
| 49 #end if | |
| 50 #if $seq_family.annotate_non_family_queries | |
| 51 -a | |
| 52 #end if | |
| 53 #if $seq_family.seq_family_gene_ids_separator | |
| 54 -g "$seq_family.seq_family_gene_ids_separator" | |
| 55 #end if | |
| 56 #if $seq_family.seq_family_id_genes_separator | |
| 57 -i '$seq_family.seq_family_id_genes_separator' | |
| 58 #end if | |
| 59 -o '$output' | |
| 60 ]]> | |
| 61 </command> | |
| 62 <inputs> | |
| 63 <conditional name="input_config"> | |
| 64 <param type="select" name="input_config_selector" label="Choose input configuration options"> | |
| 65 <option value="basic" selected="true">Basic</option> | |
| 66 <option value="advanced">Advanced</option> | |
| 67 </param> | |
| 68 <when value="basic"> | |
| 69 <param type="data" multiple="true" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search results in tabular format (-s)" help="Files in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them. | |
| 70 Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." /> | |
| 71 </when> | |
| 72 <when value="advanced"> | |
| 73 <repeat name="advanced_input_repeat" title="Sequence similarity table" min="1" default="1"> | |
| 74 <param type="data" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search result in tabular format (-s)" help="File in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them. | |
| 75 Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." /> | |
| 76 <param type="text" optional="true" name="field_separator" argument="-p" label="Field separator (-p)" help="Field-Separator of the (-s) sequence similarity table. The default value is the 'TAB' character. Set to 'default' to use the hard coded default"> | |
| 77 <sanitizer> | |
| 78 <valid initial="default"> | |
| 79 <add preset="string.printable" /> | |
| 80 </valid> | |
| 81 </sanitizer> | |
| 82 </param> | |
| 83 <param type="text" optional="true" name="header" argument="-e" label="Header of the sequence similarity tables (-e)" help="Header of the (-s) sequence similarity table. Separated by space (' ') the names of the columns | |
| 84 in order of appearance in the respective table. Required and default columns are 'qacc sacc stitle'. Set to 'default' to use the hard coded default" /> | |
| 85 <param type="data" optional="true" name="blacklist_regexs" argument="-b" format="tabular" label="Blacklist Regexs (-b)" help="A file with regular expressions, one per line. Any match to any of these | |
| 86 regular expressions causes sequence similarity search result descriptions ('stitle' in Blast terminology) to be discarded from the prot-scriber annotation process. Set to 'default' to use the hard coded default" /> | |
| 87 <param type="data" optional="true" name="capture_replace_pairs" argument="-c" format="tabular" label="Capture replace pairs (-c)" help="A file with pairs of lines. Within each pair the first line is a regular expressions | |
| 88 defining one or more capture groups. The second line of a pair is the string used to replace the match in the regular expression with. Set to 'default' to use the hard coded default" /> | |
| 89 <param type="data" optional="true" name="filter_regexs" argument="-l" format="tabular" label="Filter regexs (-l)" help="A file with regular expressions, one per line. Any match to any of these | |
| 90 regular expressions causes the matched sub-string to be deleted, i.e. filtered out. Set to 'default' to use the hard coded default" /> | |
| 91 </repeat> | |
| 92 <section title="Expert options" name="expert_options"> | |
| 93 <param type="data" optional="true" name="non_informative_words_regexs" argument="-w" format="tabular" label="Non informative words regexs (-w)" help="A file in which regular expressions (regexs) are stored, one per line. These | |
| 94 regexs are used to recognize non-informative words, which will only receive a minimum score in the prot-scriber process that generates human readable description." /> | |
| 95 <param type="text" optional="true" name="description_split_regex" argument="-r" label="Description split regex (-r)" help="A regular expression to be used to split descriptions (`stitle` in Blast | |
| 96 terminology) into words. Default is '([~_\-/|\;,':.\s]+)'."> | |
| 97 <sanitizer> | |
| 98 <valid initial="default"> | |
| 99 <add preset="string.printable" /> | |
| 100 </valid> | |
| 101 </sanitizer> | |
| 102 </param> | |
| 103 <param type="float" optional="true" name="center_inverse_word_information_content_at_quantile" argument="-q" label="Center inverse word-information-content at quantile (-q)" help="The quantile (percentile) to be subtracted from calculated inverse word information | |
| 104 content to center these values. Value between 0 and 1, or the literal 50 to center at the mean (the default)." /> | |
| 105 </section> | |
| 106 </when> | |
| 107 </conditional> | |
| 108 <section title="Sequence family annotation" name="seq_family"> | |
| 109 <param type="data" optional="true" name="seq_families" argument="-f" format="tabular" label="Families of biological sequences (-f)" help="A file in which families of biological sequences are stored, one family per line. Each | |
| 110 line must have format 'fam_name TAB gene1,gene2,gene3'. Make sure no gene appears in | |
| 111 more than one family." /> | |
| 112 <param type="boolean" optional="true" name="annotate_non_family_queries" argument="-a" label="Annotate non family query sequences (-a)" help="Set this to true to also annotate query sequences that are not members of a sequence family." /> | |
| 113 <param type="text" optional="true" name="seq_family_gene_ids_separator" argument="-g" label="Sequence family file gene-id separator (-g)" help=" A regular expression used to split the list of gene_identifiers in the | |
| 114 argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'."> | |
| 115 <sanitizer> | |
| 116 <valid initial="default"> | |
| 117 <add preset="string.printable" /> | |
| 118 </valid> | |
| 119 </sanitizer> | |
| 120 </param> | |
| 121 <param type="text" optional="true" name="seq_family_id_genes_separator" argument="-i" label="Sequence family file family - gene-id separator (-i)" help="A string used as separator in the argument --seq-families (-f) gene families file. This | |
| 122 string separates the gene_family_identifier (name) from the gene_identifier list that family comprises. Default is 'TAB'."> | |
| 123 <sanitizer> | |
| 124 <valid initial="default"> | |
| 125 <add preset="string.printable" /> | |
| 126 </valid> | |
| 127 </sanitizer> | |
| 128 </param> | |
| 129 </section> | |
| 130 </inputs> | |
| 131 <outputs> <!-- single tabular result dataset; its path is handed to the command line via -o '$output' --> | |
| 132 <data format="tabular" name="output" /> | |
| 133 </outputs> | |
| 134 <tests> | |
| 135 <test> <!-- basic mode: similarity tables only, every other option left unset --> | |
| 136 <param name="input_config_selector" value="basic"/> | |
| 137 <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" /> | |
| 138 <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" /> | |
| 139 <output name="output" file="8_Proteins_prot-scriber.out" sort="true" /> | |
| 140 </test> | |
| 141 <test> <!-- advanced mode: two tables, each with field separator 'default' and an explicit header; compared against the same expected file as the basic test --> | |
| 142 <param name="input_config_selector" value="advanced" /> | |
| 143 <repeat name="advanced_input_repeat"> | |
| 144 <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" /> | |
| 145 <param name="field_separator" value="default" /> | |
| 146 <param name="header" value="qacc sacc stitle" /> | |
| 147 </repeat> | |
| 148 <repeat name="advanced_input_repeat"> | |
| 149 <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" /> | |
| 150 <param name="field_separator" value="default" /> | |
| 151 <param name="header" value="qacc sacc stitle" /> | |
| 152 </repeat> | |
| 153 <output name="output" file="8_Proteins_prot-scriber.out" sort="true" /> | |
| 154 </test> | |
| 155 <test> <!-- advanced mode: a blacklist file per table plus the expert split regex and quantile options; compared against the same expected file --> | |
| 156 <param name="input_config_selector" value="advanced" /> | |
| 157 <repeat name="advanced_input_repeat"> | |
| 158 <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" /> | |
| 159 <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" /> | |
| 160 </repeat> | |
| 161 <repeat name="advanced_input_repeat"> | |
| 162 <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" /> | |
| 163 <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" /> | |
| 164 </repeat> | |
| 165 <param name="description_split_regex" value="([~_\-/|;,':.\s]+)" /> | |
| 166 <param name="center_inverse_word_information_content_at_quantile" value="50" /> | |
| 167 <output name="output" file="8_Proteins_prot-scriber.out" sort="true" /> | |
| 168 </test> | |
| 169 </tests> | |
| 170 <help> | |
| 171 <![CDATA[ | |
| 172 | |
| 173 **What it does** | |
| 174 | |
| 175 prot-scriber_ assigns short human readable descriptions (HRD) to query biological sequences using reference candidate descriptions. | |
| 176 In this, prot-scriber consumes sequence similarity search (Blast or Diamond or similar) results in tabular format. | |
| 177 A customized lexical analysis is carried out on the descriptions of these Blast Hits and a resulting HRD is assigned to the query sequences. | |
| 178 For more information, examples and how to use the prot-scriber commandline tool refer to the prot-scriber README_ and MANUAL_. | |
| 179 | |
| 180 .. _prot-scriber: http://github.com/usadellab/prot-scriber | |
| 181 .. _README: https://github.com/usadellab/prot-scriber/blob/master/README.md | |
| 182 .. _MANUAL: https://github.com/usadellab/prot-scriber/blob/master/README.md#manual | |
| 183 | |
| 184 ---- | |
| 185 | |
| 186 **Input** | |
| 187 | |
| 188 The input file is one or multiple tabular output(s) of a sequence similarity search (Blast, Diamond or similar). | |
| 189 Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond). The input is done via the -s parameter:: | |
| 190 | |
| 191 -s, --seq-sim-table | |
| 192 File in which to find sequence similarity search results in tabular format (SSST). Use | |
| 193 e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast) | |
| 194 or 'qseqid sseqid stitle' (Diamond). If the required columns, or more, appear in different order than | |
| 195 shown here you must use the --header (-e) argument. If any of the input SSSTs uses a | |
| 196 different field-separator than the '<TAB>' character, you must provide the | |
| 197 --field-separator (-p) argument. You can provide multiple SSSTs for your query proteins whose information | |
| 198 will be combined and evaluated by the tool. | |
| 199 | |
| 200 **Input parameters** | |
| 201 | |
| 202 prot-scriber gives the user the opportunity to fine tune parameters for the provided input tables. | |
| 203 To do so turn on the *input configuration* switch. Those are optional, as the tool also provides sensible defaults. | |
| 204 In case you decide to customize your inputs using below parameters, be advised that prot-scriber expects the | |
| 205 customized parameter for all input tables - the number of tables and e.g. *--header* parameters have to match. | |
| 206 You can set the values to 'default' if you want to use the default value for a given input table:: | |
| 207 | |
| 208 -e, --header | |
| 209 Header of the --seq-sim-table (-s) arg. Separated by space (' ') the names of the | |
| 210 columns in order of appearance in the respective table. Required and default columns are | |
| 211 'qacc sacc stitle'. Note that this option only understands Blast terminology, i.e. even | |
| 212 if you ran Diamond, please provide 'qacc' instead of 'qseqid' and 'sacc' instead of | |
| 213 'sseqid'. Luckily 'stitle' is 'stitle' in Diamond, too. You can have additional columns | |
| 214 that will be ignored, as long as the required columns appear in the correct order. | |
| 215 Consider this example: 'qacc sacc evalue bitscore stitle'. Set to 'default' to use the hard coded default. | |
| 216 | |
| 217 -p, --field-separator | |
| 218 Field-Separator of the --seq-sim-table (-s) arg. The default value is the '<TAB>' character. | |
| 219 Consider this example: ','. You can provide 'default' to use the hard coded default (TAB). | |
| 220 | |
| 221 -b, --blacklist-regexs (Expert option) | |
| 222 A file with regular expressions, one per line. Any match to any of these | |
| 223 regular expressions causes sequence similarity search result descriptions ('stitle' in | |
| 224 Blast terminology) to be discarded from the prot-scriber annotation process. Set to 'default' to use the hard | |
| 225 coded default. An example file can be downloaded here: | |
| 226 https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/blacklist_stitle_regexs.txt. | |
| 227 | |
| 228 -l, --filter-regexs (Expert option) | |
| 229 A file with regular expressions, one per line. Any match to any of these | |
| 230 regular expressions causes the matched sub-string to be deleted, i.e. filtered out. | |
| 231 Filtering is used to process descriptions ('stitle' in Blast terminology) and prepare | |
| 232 the descriptions for the prot-scriber annotation process. In case of UniProt sequence | |
| 233 similarity search results (Blast result tables), this removes the Blast Hit identifier | |
| 234 (`sacc`) from the description (`stitle`) and also removes the taxonomic information | |
| 235 starting with e.g. 'OS=' at the end of the `stitle` strings. Set to 'default' to use | |
| 236 the hard coded default. An example file can be downloaded here: | |
| 237 https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/filter_stitle_regexs.txt. | |
| 238 | |
| 239 -c, --capture-replace-pairs (Expert option) | |
| 240 A file with pairs of lines. Within each pair the first line is a regular expression | |
| 241 defining one or more capture groups. The second line of a pair is the | |
| 242 string used to replace the match in the regular expression with. This means the second | |
| 243 line contains the capture groups. These pairs are used to further filter | |
| 244 the sequence similarity search result descriptions ('stitle' in Blast terminology). In | |
| 245 contrast to the --filter-regexs (-l) matches are not deleted, but replaced with the | |
| 246 second line of the pair. Filtering is used to process descriptions ('stitle' in Blast | |
| 247 terminology) and prepare the descriptions for the prot-scriber annotation process. | |
| 248 Set to 'default' to use the hard coded default. An example file can be downloaded here: | |
| 249 https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/capture_replace_pairs.txt. | |
| 250 | |
| 251 ---- | |
| 252 | |
| 253 **Gene family annotation** | |
| 254 | |
| 255 prot-scriber can also apply the same methodology to produce HRDs for sets of biological sequences, i.e. gene families:: | |
| 256 | |
| 257 -f, --seq-families | |
| 258 A file in which families of biological sequences are stored, one family per line. Each | |
| 259 line must have format 'fam-name TAB gene1,gene2,gene3'. Make sure no gene appears in | |
| 260 more than one family. | |
| 261 | |
| 262 -g, --seq-family-gene-ids-separator | |
| 263 A regular expression used to split the list of gene-identifiers in the | |
| 264 argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'. | |
| 265 | |
| 266 -a, --annotate-non-family-queries | |
| 267 Use this option only in combination with --seq-families (-f), i.e. when prot-scriber is | |
| 268 used to generate human readable descriptions for gene families. If in that context this | |
| 269 flag is given, queries for which there are sequence similarity search (Blast) results | |
| 270 but that are NOT member of a sequence family will receive an annotation (human readable | |
| 271 description) in the output file, too. Default value of this setting is 'OFF' (false). | |
| 272 | |
| 273 ---- | |
| 274 | |
| 275 **Expert options** | |
| 276 | |
| 277 Some additional optional configuration. Only use when you know what you are doing:: | |
| 278 | |
| 279 -w, --non-informative-words-regexs | |
| 280 A file in which regular expressions (regexs) are stored, one per line. These | |
| 281 regexs are used to recognize non-informative words, which will only receive a minimum | |
| 282 score in the prot-scriber process that generates human readable description. There is a | |
| 283 default list hard-coded into prot-scriber. An example file can be downloaded here: | |
| 284 https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/non_informative_words_regexs.txt. | |
| 285 | |
| 286 -r, --description-split-regex | |
| 287 A regular expression to be used to split descriptions (`stitle` in Blast | |
| 288 terminology) into words. Default is '([~_\-/|\;,':.\s]+)'. | |
| 289 | |
| 290 -q, --center-inverse-word-information-content-at-quantile | |
| 291 The quantile (percentile) to be subtracted from calculated inverse word information | |
| 292 content to center these values. Consequently, this must be a value between zero and one | |
| 293 or literal 50, which is interpreted as mean instead of a quantile. Default is 50, | |
| 294 implying centering at the mean. | |
| 295 | |
| 296 ---- | |
| 297 | |
| 298 **Output** | |
| 299 | |
| 300 prot-scriber outputs a single tab-separated text file with the annotated sequences or gene-families, depending on how you ran the program, one result per line:: | |
| 301 | |
| 302 Annotee-Identifier Human-Readable-Description | |
| 303 Soltu.DM.02G020600.1 arath strubbelig receptor family | |
| 304 Soltu.DM.S001650.1 germin member | |
| 305 Soltu.DM.03G011280.1 increased dna methylation | |
| 306 ... | |
| 307 | |
| 308 ]]> | |
| 309 </help> | |
| 310 </tool> |
