prot_scriber: prot-scriber.xml comparison

comparison prot-scriber.xml @ 0:278aa57e2c4d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/prot-scriber commit 8b58f3f03d6430689d228029bb2eb46c16cfff23

author	iuc
date	Tue, 10 May 2022 13:17:28 +0000
parents
children	a830e9f84593

comparison

equal deleted inserted replaced

--1:000000000000
+:278aa57e2c4d
+<tool id="prot_scriber" name="prot-scriber" version="@TOOL_VERSION@" profile="21.05">
+<!-- One-line summary of the tool. -->
+<description>Protein annotation of short human readable descriptions</description>
+<!-- The version is defined once here and reused by the tool's version attribute and the package requirement below. -->
+<macros>
+<token name="@TOOL_VERSION@">0.1.1</token>
+</macros>
+<requirements>
+<requirement version="@TOOL_VERSION@" type="package">prot-scriber</requirement>
+</requirements>
+<!-- Treat the job as failed whenever the word "panicked" appears on stderr. -->
+<stdio>
+<regex source="stderr" match="panicked" level="fatal"/>
+</stdio>
+<!--
+Cheetah template that assembles the prot-scriber command line.
+Basic mode passes every selected similarity table with -s only.
+Advanced mode passes one -s per repeat instance plus the per-table options that were filled in,
+followed by the expert options. The sequence family options and the output file are always
+handled last, independent of the selected mode.
+-->
+<command>
+<![CDATA['prot-scriber'
+## Input tables: one -s per table in both modes.
+#if str($input_config.input_config_selector) == "basic"
+#for $sst in $input_config.seq_sim_table
+-s '$sst'
+#end for
+#else if str($input_config.input_config_selector) == "advanced"
+#for $ssr in $input_config.advanced_input_repeat
+-s '$ssr.seq_sim_table'
+## Per-table options are emitted only when the user supplied a value.
+#if $ssr.header
+-e '$ssr.header'
+#end if
+#if $ssr.field_separator
+-p '$ssr.field_separator'
+#end if
+#if $ssr.blacklist_regexs
+-b '$ssr.blacklist_regexs'
+#end if
+#if $ssr.capture_replace_pairs
+-c '$ssr.capture_replace_pairs'
+#end if
+#if $ssr.filter_regexs
+-l '$ssr.filter_regexs'
+#end if
+#end for
+## Expert options exist only in advanced mode.
+#if $input_config.expert_options.non_informative_words_regexs
+-w '$input_config.expert_options.non_informative_words_regexs'
+#end if
+## NOTE(review): double quotes are presumably used because the documented default regex contains a
+## single quote; the value is still subject to shell expansion inside double quotes - confirm this is acceptable.
+#if $input_config.expert_options.description_split_regex
+-r "$input_config.expert_options.description_split_regex"
+#end if
+#if $input_config.expert_options.center_inverse_word_information_content_at_quantile
+-q $input_config.expert_options.center_inverse_word_information_content_at_quantile
+#end if
+#end if
+## Sequence family options. These parameters are declared inside the seq_family section,
+## so they must be referenced through that section both in the test and in the expansion.
+#if $seq_family.seq_families
+-f '$seq_family.seq_families'
+#end if
+#if $seq_family.annotate_non_family_queries
+-a
+#end if
+#if $seq_family.seq_family_gene_ids_separator
+-g "$seq_family.seq_family_gene_ids_separator"
+#end if
+#if $seq_family.seq_family_id_genes_separator
+-i '$seq_family.seq_family_id_genes_separator'
+#end if
+-o '$output'
+]]>
+</command>
+<!--
+Tool form definition.
+input_config switches between a basic mode (only the similarity tables) and an advanced mode
+(one repeat instance per table with per-table options, plus an expert section).
+seq_family holds the optional sequence family settings and is available in both modes.
+-->
+<inputs>
+<conditional name="input_config">
+<param type="select" name="input_config_selector" label="Choose input configuration options">
+<option value="basic" selected="true">Basic</option>
+<option value="advanced">Advanced</option>
+</param>
+<when value="basic">
+<!-- All tables are selected in one multiple-dataset input. -->
+<param type="data" multiple="true" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search results in tabular format (-s)" help="Files in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them.
+Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." />
+</when>
+<when value="advanced">
+<!-- One repeat instance per similarity table; at least one instance is required. -->
+<repeat name="advanced_input_repeat" title="Sequence similarity table" min="1" default="1">
+<param type="data" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search result in tabular format (-s)" help="File in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them.
+Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." />
+<!-- Free-text value; the sanitizer admits the string.printable preset. -->
+<param type="text" optional="true" name="field_separator" argument="-p" label="Field separator (-p)" help="Field-Separator of the (-s) sequence similarity table. The default value is the 'TAB' character. Set to 'default' to use the hard coded default">
+<sanitizer>
+<valid initial="default">
+<add preset="string.printable" />
+</valid>
+</sanitizer>
+</param>
+<param type="text" optional="true" name="header" argument="-e" label="Header of the sequence similarity tables (-e)" help="Header of the (-s) sequence similarity table. Separated by space (' ') the names of the columns
+in order of appearance in the respective table. Required and default columns are 'qacc sacc stitle'. Set to 'default' to use the hard coded default" />
+<!-- The three regex files below are dataset inputs; when left empty the matching option is not passed on the command line. -->
+<param type="data" optional="true" name="blacklist_regexs" argument="-b" format="tabular" label="Blacklist Regexs (-b)" help="A file with regular expressions, one per line. Any match to any of these
+regular expressions causes sequence similarity search result descriptions ('stitle' in Blast terminology) to be discarded from the prot-scriber annotation process. Leave empty to use the hard coded default" />
+<param type="data" optional="true" name="capture_replace_pairs" argument="-c" format="tabular" label="Capture replace pairs (-c)" help="A file with pairs of lines. Within each pair the first line is a regular expressions
+defining one or more capture groups. The second line of a pair is the string used to replace the match in the regular expression with. Leave empty to use the hard coded default" />
+<param type="data" optional="true" name="filter_regexs" argument="-l" format="tabular" label="Filter regexs (-l)" help="A file with regular expressions, one per line. Any match to any of these
+regular expressions causes the matched sub-string to be deleted, i.e. filtered out. Leave empty to use the hard coded default" />
+</repeat>
+<section title="Expert options" name="expert_options">
+<param type="data" optional="true" name="non_informative_words_regexs" argument="-w" format="tabular" label="Non informative words regexs (-w)" help="A file in which regular expressions (regexs) are stored, one per line. These
+regexs are used to recognize non-informative words, which will only receive a minimum score in the prot-scriber process that generates human readable description." />
+<param type="text" optional="true" name="description_split_regex" argument="-r" label="Description split regex (-r)" help="A regular expression to be used to split descriptions (`stitle` in Blast
+terminology) into words. Default is '([~_\-/|\;,':.\s]+)'.">
+<sanitizer>
+<valid initial="default">
+<add preset="string.printable" />
+</valid>
+</sanitizer>
+</param>
+<!-- NOTE(review): declared as integer although the help text asks for a value between 0 and 1; confirm whether a float type is intended. -->
+<param type="integer" optional="true" name="center_inverse_word_information_content_at_quantile" argument="-q" label="Center inverse word-information-content at quantile (-q)" help="The quantile (percentile) to be subtracted from calculated inverse word information
+content to center these values. Value between 0 and 1." />
+</section>
+</when>
+</conditional>
+<section title="Sequence family annotation" name="seq_family">
+<param type="data" optional="true" name="seq_families" argument="-f" format="tabular" label="Families of biological sequences (-f)" help="A file in which families of biological sequences are stored, one family per line. Each
+line must have format 'fam_name TAB gene1,gene2,gene3'. Make sure no gene appears in
+more than one family." />
+<param type="boolean" optional="true" name="annotate_non_family_queries" argument="-a" label="Annotate non family query sequences (-a)" help="Set this to true to also annotate sequences that are not members of a sequence family." />
+<param type="text" optional="true" name="seq_family_gene_ids_separator" argument="-g" label="Sequence family file gene-id separator (-g)" help="A regular expression used to split the list of gene_identifiers in the
+argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'.">
+<sanitizer>
+<valid initial="default">
+<add preset="string.printable" />
+</valid>
+</sanitizer>
+</param>
+<param type="text" optional="true" name="seq_family_id_genes_separator" argument="-i" label="Sequence family file family - gene-id separator (-i)" help="A string used as separator in the argument --seq-families (-f) gene families file. This
+string separates the gene_family_identifier (name) from the gene_identifier list that family comprises. Default is 'TAB'.">
+<sanitizer>
+<valid initial="default">
+<add preset="string.printable" />
+</valid>
+</sanitizer>
+</param>
+</section>
+</inputs>
+<!-- Single tabular result dataset; the command template passes it to the tool with the -o option. -->
+<outputs>
+<data name="output" format="tabular"/>
+</outputs>
+<!-- All three tests compare against the same expected file, 8_Proteins_prot-scriber.out (sorted before comparison). -->
+<tests>
+<!-- Basic mode: both similarity tables are given to the single multiple-dataset input as one comma-separated value. -->
+<test>
+<param name="input_config_selector" value="basic"/>
+<param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt,8_Proteins_vs_Trembl_blastp.txt" />
+<output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
+</test>
+<!-- Advanced mode: one repeat instance per table, with field separator and header set explicitly. -->
+<test>
+<param name="input_config_selector" value="advanced" />
+<repeat name="advanced_input_repeat">
+<param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" />
+<param name="field_separator" value="default" />
+<param name="header" value="qacc sacc stitle" />
+</repeat>
+<repeat name="advanced_input_repeat">
+<param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" />
+<param name="field_separator" value="default" />
+<param name="header" value="qacc sacc stitle" />
+</repeat>
+<output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
+</test>
+<!-- Advanced mode with a blacklist file per table plus the expert options -r and -q. -->
+<test>
+<param name="input_config_selector" value="advanced" />
+<repeat name="advanced_input_repeat">
+<param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" />
+<param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" />
+</repeat>
+<repeat name="advanced_input_repeat">
+<param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" />
+<param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" />
+</repeat>
+<param name="description_split_regex" value="([~_\-/|;,':.\s]+)" />
+<param name="center_inverse_word_information_content_at_quantile" value="50" />
+<output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
+</test>
+</tests>
+<help>
+<![CDATA[
+**What it does**
+prot-scriber_ assigns short human readable descriptions (HRD) to query biological sequences using reference candidate descriptions.
+In this, prot-scriber consumes sequence similarity search (Blast or Diamond or similar) results in tabular format.
+A customized lexical analysis is carried out on the descriptions of these Blast Hits and a resulting HRD is assigned to the query sequences.
+For more information, examples and how to use the prot-scriber commandline tool refer to the prot-scriber README_ and MANUAL_.
+.. _prot-scriber: http://github.com/usadellab/prot-scriber
+.. _README: https://github.com/usadellab/prot-scriber/blob/master/README.md
+.. _MANUAL: https://github.com/usadellab/prot-scriber/blob/master/README.md#manual
+----
+**Input**
+The input file is one or multiple tabular output(s) of a sequence similarity search (Blast, Diamond or similar).
+Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond). The input is done via the -s parameter::
+-s, --seq-sim-table
+File in which to find sequence similarity search results in tabular format (SSST). Use
+e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast)
+or 'qseqid sseqid stitle' (Diamond). If the required columns, or more, appear in different order than
+shown here you must use the --header (-e) argument. If any of the input SSSTs uses a
+different field-separator than the '<TAB>' character, you must provide the --field-
+separator (-p) argument. You can provide multiple SSSTs for your query proteins whose information
+will be combined and evaluated by the tool.
+**Input parameters**
+prot-scriber gives the user the opportunity to fine tune parameters for the provided input tables.
+To do so turn on the *input configuration* switch. Those are optional, as the tool also provides sensible defaults.
+In case you decide to customize your inputs using below parameters, be advised that prot-scriber expects the
+customized parameter for all input tables - the number of tables and e.g. *--header* parameters have to match.
+You can set the values to 'default' if you want to use the default value for a given input table::
+-e, --header
+Header of the --seq-sim-table (-s) arg. Separated by space (' ') the names of the
+columns in order of appearance in the respective table. Required and default columns are
+'qacc sacc stitle'. Note that this option only understands Blast terminology, i.e. even
+if you ran Diamond, please provide 'qacc' instead of 'qseqid' and 'sacc' instead of
+'sseqid'. Luckily 'stitle' is 'stitle' in Diamond, too. You can have additional columns
+that will be ignored, as long as the required columns appear in the correct order.
+Consider this example: 'qacc sacc evalue bitscore stitle'. Set to 'default' to use the hard coded default.
+-p, --field-separator
+Field-Separator of the --seq-sim-table (-s) arg. The default value is the '<TAB>' character.
+Consider this example: ','. You can provide 'default' to use the hard coded default (TAB).
+-b, --blacklist-regexs (Expert option)
+A file with regular expressions, one per line. Any match to any of these
+regular expressions causes sequence similarity search result descriptions ('stitle' in
+Blast terminology) to be discarded from the prot-scriber annotation process. Set to 'default' to use the hard
+coded default. An example file can be downloaded here:
+https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/blacklist_stitle_regexs.txt.
+-l, --filter-regexs (Expert option)
+A file with regular expressions, one per line. Any match to any of these
+regular expressions causes the matched sub-string to be deleted, i.e. filtered out.
+Filtering is used to process descriptions ('stitle' in Blast terminology) and prepare
+the descriptions for the prot-scriber annotation process. In case of UniProt sequence
+similarity search results (Blast result tables), this removes the Blast Hit identifier
+(`sacc`) from the description (`stitle`) and also removes the taxonomic information
+starting with e.g. 'OS=' at the end of the `stitle` strings. Set to 'default' to use
+the hard coded default. An example file can be downloaded here:
+https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/filter_stitle_regexs.txt.
+-c, --capture-replace-pairs (Expert option)
+A file with pairs of lines. Within each pair the first line is a regular expressions
+defining one or more capture groups. The second line of a pair is the
+string used to replace the match in the regular expression with. This means the second
+line contains the capture groups. These pairs are used to further filter
+the sequence similarity search result descriptions ('stitle' in Blast terminology). In
+contrast to the --filter-regexs (-l) matches are not deleted, but replaced with the
+second line of the pair. Filtering is used to process descriptions ('stitle' in Blast
+terminology) and prepare the descriptions for the prot-scriber annotation process.
+Set to 'default' to use the hard coded default. An example file can be downloaded here:
+https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/capture_replace_pairs.txt.
+----
+**Gene family annotation**
+prot-scriber can also apply the same methodology to produce HRDs for sets of biological sequences, i.e. gene families::
+-f, --seq-families
+A file in which families of biological sequences are stored, one family per line. Each
+line must have format 'fam-name TAB gene1,gene2,gene3'. Make sure no gene appears in
+more than one family.
+-g, --seq-family-gene-ids-separator
+A regular expression used to split the list of gene-identifiers in the
+argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'.
+-a, --annotate-non-family-queries
+Use this option only in combination with --seq-families (-f), i.e. when prot-scriber is
+used to generate human readable descriptions for gene families. If in that context this
+flag is given, queries for which there are sequence similarity search (Blast) results
+but that are NOT member of a sequence family will receive an annotation (human readable
+description) in the output file, too. Default value of this setting is 'OFF' (false).
+----
+**Expert options**
+Some additional optional configuration. Only use when you know what you are doing::
+-w, --non-informative-words-regexs
+A file in which regular expressions (regexs) are stored, one per line. These
+regexs are used to recognize non-informative words, which will only receive a minimum
+score in the prot-scriber process that generates human readable description. There is a
+default list hard-coded into prot-scriber. An example file can be downloaded here:
+https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/non_informative_words_regexs.txt.
+-r, --description-split-regex
+A regular expression to be used to split descriptions (`stitle` in Blast
+terminology) into words. Default is '([~_\-/|\;,':.\s]+)'.
+-q, --center-inverse-word-information-content-at-quantile
+The quantile (percentile) to be subtracted from calculated inverse word information
+content to center these values. Consequently, this must be a value between zero and one
+or literal 50, which is interpreted as mean instead of a quantile. Default is 50,
+implying centering at the mean.
+----
+**Output**
+prot-scriber outputs a single tab-separated text file with the annotated sequences or gene-families, depending on how you ran the program, one result per line::
+Annotee-Identifier	Human-Readable-Description
+Soltu.DM.02G020600.1	arath strubbelig receptor family
+Soltu.DM.S001650.1	germin member
+Soltu.DM.03G011280.1	increased dna methylation
+...
+]]>
+</help>
+</tool>

Mercurial > repos > iuc > prot_scriber

comparison prot-scriber.xml @ 0:278aa57e2c4d draft