annotate patser-v3e.xml @ 1:4d9823e0f6f7 draft default tip

planemo upload for repository http://stormo.wustl.edu/resources.html
author mvdbeek
date Mon, 29 Jun 2015 05:57:00 -0400
parents f9ab3aa3e538
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
4d9823e0f6f7 planemo upload for repository http://stormo.wustl.edu/resources.html
mvdbeek
parents: 0
diff changeset
1 <tool id="patser-v3e" name="patser" version="0.1.2">
4d9823e0f6f7 planemo upload for repository http://stormo.wustl.edu/resources.html
mvdbeek
parents: 0
diff changeset
2 <description>finds putative transcription factor binding sites</description>
0
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
3 <requirements>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
4 <requirement type="package" version="v3e">patser</requirement>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
5 </requirements>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
6 <stdio>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
7 <exit_code range="1:" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
8 </stdio>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
9
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
10 <command><![CDATA[
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
11 ## We need to transform the fasta input file into the awkward format that patser can work on.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
12 ## The fasta header must be followed by the nucleotide sequence encapsulated by backslashes.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
13 ## We simply add backslashes before and after each fasta header and skip the first line,
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
14 ## and we add a final backslash at the end of the file.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
15 awk '/>/{print "\\"}1' "$input_fasta"|awk '/>/{print;print "\\";next}1'|tail -n +2 >> special.fa;
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
16 echo "\\" >> special.fa;
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
## Run patser on the reformatted sequences. Boolean flags ($v, $c, $p) are left unquoted
## so that an unchecked option expands to nothing instead of an empty "" argument.
17 patser-v3e -A a:t "$at" c:g "$gc" -m "$input_matrix" $v -b "$b" $c -d1 -ls "$ls" -f special.fa $p > "$output1"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
18 ]]></command>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
19 <inputs>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
20 <param type="data" name="input_matrix" format="txt" help="Provide alignment matrix file"/>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
21 <param type="data" name="input_fasta" format="fasta" help="Fasta file with sequence"/>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
22 <param name="v" type="boolean" label="the matrix is a vertical matrix (default: horizontal matrix)"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
23 truevalue="-v" falsevalue=""
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
24 help="commandline option -v" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
25 <param name="b" type="integer" label="Correction added to the elements of the alignment matrix"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
26 value="1"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
27 help="commandline option -b" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
28 <param name="gc" type="float" label="Enter the GC frequency"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
29 value="0.25" min="0" max="1"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
30 help="commandline option -A c:g (value)" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
31 <param name="at" type="float" label="Enter the AT frequency"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
32 value="0.25" min="0" max="1"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
33 help="commandline option -A a:t (value)" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
34 <param name="c" type="boolean" label="Also score the complementary sequences"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
35 truevalue="-c" falsevalue="" checked="true"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
36 help="commandline option -c: Also score the complementary sequences. The complements are determined by the program and are not explicitly stated in the sequence fasta" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
37 <param name="p" type="boolean" label="print the weight matrix derived from the alignment matrix"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
38 truevalue="-p" falsevalue="" checked="true"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
39 help="commandline option -p" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
40 <param name="ls" type="float" label="Lower-threshold score, inclusive"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
41 value="7"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
42 help="commandline option -ls" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
43 </inputs>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
44 <outputs>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
45 <data name="output1" format="txt" from_work_dir="output.txt" />
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
46 </outputs>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
47 <tests>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
48 <test>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
49 <param name="input_matrix" value="PWM_training_EcR-USP.txt"/>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
50 <param name="input_fasta" value="EcR_USP_224.fa"/>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
51 <output name="output1" file="output.txt" lines_diff="6"/>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
52 </test>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
53 </tests>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
54 <help><![CDATA[
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
55
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
56 This wrapper has been written by Marius van den Beek (m.vandenbeek at gmail.com).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
57 Patser is available from http://stormo.wustl.edu/resources.html .
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
58
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
59 -------------------------------------------------------------------------------
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
60
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
61 The following options can be determined on the command line:
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
62
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
63 ::
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
64
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
65 0) -h: print these directions.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
66
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
67 1) Matrix options.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
68 -m filename: (default name is "matrix") file containing the matrix.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
69 -w: the matrix is a weight matrix (default: alignment matrix)
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
70 -b number: a non-negative number indicating the total number of
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
71 pseudo-counts added to each alignment position (default: 1).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
72 Before converting an alignment matrix to a weight matrix, the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
73 total pseudo-counts multiplied by the a priori probability
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
74 (see section 3 below) of the corresponding letter is added
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
75 to each matrix element.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
76 -v: the matrix is a vertical matrix (default: horizontal matrix).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
77 -p: print the weight matrix derived from the alignment matrix.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
78
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
79 2) -f filename: this file (default: read from the standard input) contains
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
80 the names of the sequences. The corresponding sequence may follow
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
81 its name if the sequence is enclosed between backslashes (\).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
82 Otherwise, the sequence is assumed to be in a separate file having
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
83 the indicated name.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
84
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
85 In the sequences, whitespace, slashes (/), periods, dashes (unless
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
86 part of an integer when the "-i" option is used), and comments
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
87 beginning with ';', '%', or '#' are ignored. When using letter
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
88 characters (i.e., with the "-a" or "-A" alphabet option), integers
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
89 are also ignored so that the sequence file can contain positional
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
90 information. When using integer characters (i.e., with the "-i"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
91 alphabet option) the integers must be separated by whitespace.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
92
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
93 A "-c" preceding the name of a sequence file indicates that the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
94 corresponding sequence is circular.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
95
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
96 3) Alphabet options---the three options in this section are mutually
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
97 exclusive (default: "-a alphabet"). The a priori probabilities mentioned
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
98 below are used when converting an alignment matrix to a weight matrix.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
99 -a filename: file containing the alphabet and normalization information.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
100
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
101 Each line contains a letter (a symbol in the alphabet) followed by an
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
102 optional normalization number (default: 1.0). The normalization is
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
103 based on the relative a priori probabilities of the letters. For
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
104 nucleic acids, this might be the genomic frequency of the bases
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
105 or the frequencies observed in the data used to generate the alignment.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
106 In nucleic acid alphabets, a letter and its complement appear on the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
107 same line, separated by a colon (a letter can be its own complement,
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
108 e.g. when using a dimer alphabet). Complementary letters may use the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
109 same normalization number. Only the standard 26 letters are
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
110 permissible; however, when the "-CS" option is used, the alphabet is
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
111 case sensitive so that a total of 52 different characters are possible.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
112
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
113 POSSIBLE LINE FORMATS WITHOUT COMPLEMENTARY LETTERS:
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
114 letter
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
115 letter normalization
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
116
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
117 POSSIBLE LINE FORMATS WITH COMPLEMENTARY LETTERS:
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
118 letter:complement
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
119 letter:complement normalization
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
120 letter:complement normalization:complement's_normalization
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
121
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
122 -i filename: same as the "-a" option, except that the symbols of
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
123 the alphabet are represented by integers rather than by letters.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
124 Any integer permitted by the machine is a permissible symbol.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
125
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
126 -A alphabet_and_normalization_information: same as "-a" option, except
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
127 information appears on the command line (e.g., -A a:t 3 c:g 2).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
128
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
129 4) Alphabet modifiers indicating whether ascii alphabets are case
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
130 sensitive---the two options in this section are mutually exclusive
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
131 with each other and with the "-i" option (default: ascii alphabets are
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
132 case insensitive).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
133 -CS: ascii alphabets are case sensitive.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
134 -CM: ascii alphabets are case insensitive, but mark the location
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
135 of lowercase letters by printing a line containing their locations.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
136 This option is useful when lowercase letters indicate a functional
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
137 landmark such as a transcriptional start in a DNA sequence.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
138
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
139 5) Options for adjusting or restricting which information
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
140 and scores are printed.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
141 The "-ls", "-li", and "-lp" options are mutually exclusive.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
142 -c: also score the complementary sequences. The complements are
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
143 determined by the program and are not explicitly stated in the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
144 sequence files.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
145 -ls number: lower threshold for printing scores, inclusive
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
146 (formerly the -l option).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
147 -li: assume that the maximum ln(p-value) for printing scores equals
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
148 the negative of the sample-size adjusted information content;
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
149 indirectly determines the lower threshold for printing scores.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
150 -lp number: the maximum ln(p-value) for printing scores; indirectly
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
151 determines the lower threshold for printing scores.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
152 -u number: upper threshold for printing scores, exclusive.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
153
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
154 -t: just print the top score for each sequence.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
155 -t number: print the indicated number of top scores for each sequence.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
156 -ds: if the "-t number" option is used, print the top scores for each
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
157 sequence in the order of decreasing score (default: order the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
158 scores according to their position within the sequence).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
159 -e number: the small difference for considering 2 scores equal
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
160 (default: 0.000001)
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
161
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
162 -s: print the sequence corresponding to each score that is printed.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
163
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
164 6) Options indicating how unrecognized symbols are treated (default: -d1).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
165 Symbols are letters when option "-a" or "-A" is used;
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
166 symbols are integers when option "-i" is used.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
167 The following three options are mutually exclusive.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
168 -d0: treat unrecognized symbols as errors and exit the program.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
169 -d1: treat unrecognized symbols as discontinuities, but print a warning.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
170 Treating a symbol as a discontinuity means that any L-mer
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
171 containing the unrecognized symbol will be ignored.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
172 -d2: treat unrecognized symbols as discontinuities, and print NO warning.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
173
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
174 7) Options for adjusting the estimation of p-value.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
175 If the -R option is set to zero, the p-value is not estimated.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
176 -R number: the range for approximating a column of the weight matrix with
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
177 integers (default: 10000). This number is the difference
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
178 between the largest and smallest integers used to estimate
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
179 the scores. Higher values increase precision, but will take
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
180 longer to calculate the table of p-values.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
181 -M number: the minimum score for approximating p-values (default: 0).
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
182 Higher values will increase precision,
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
183 but may miss interesting scores.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
184
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
185
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
186 ::
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
187
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
188 ----------------------------------------------------------------------
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
189
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
190 Copyright 1990, 1994, 1995, 1996, 2000, 2001, 2002 Gerald Z. Hertz
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
191 May be copied for noncommercial purposes.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
192
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
193 Author:
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
194 Gerald Z. Hertz
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
195 gzhertz AT alum.mit.edu
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
196
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
197 PATSER (version 3e)
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
198
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
199 This program scores the L-mers (subsequences of length L) of the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
200 indicated sequences against the indicated alignment or weight matrix.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
201 The elements of an alignment matrix are simply the number of times
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
202 that the indicated letter is observed at the indicated position of a
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
203 sequence alignment. Such elements must be processed before the matrix
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
204 can be used to score an L-mer (e.g., Hertz and Stormo, 1999,
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
205 Bioinformatics, 15:563-577). A weight matrix is a matrix whose
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
206 elements are in a form considered appropriate for scoring an L-mer.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
207
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
208 Each element of an alignment matrix is converted to an element of a
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
209 weight matrix by first adding pseudo-counts in proportion to the a
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
210 priori probability of the corresponding letter (see option "-b" in
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
211 section 1 above) and dividing by the total number of sequences plus
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
212 the total number of pseudo-counts. The resulting frequency is
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
213 normalized by the a priori probability for the corresponding letter.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
214 The final quotient is converted to an element of a weight matrix by
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
215 taking its natural logarithm. The use of pseudo-counts here differs
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
216 from previous versions of this program by being proportional to the a
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
217 priori probability.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
218
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
219 Version 3 of this program differs from previous versions by also
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
220 numerically estimating the p-value of the scores. The p-value
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
221 calculated here is the probability of observing a particular score or
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
222 higher at a particular sequence position and does NOT account for the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
223 total amount of sequence being scored. P-values are estimated by the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
224 method described in Staden, 1989, CABIOS, p. 89--96. The relative
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
225 value for each element of the weight matrix is approximated by
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
226 integers in a range determined by the "-R" and "-M" options (section 7
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
227 above). The p-value is calculated for each possible integer score and
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
228 the values are stored. The actual scores for the sequences are
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
229 determined from the true weight matrix. The true scores are converted
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
230 to their corresponding integer values and their p-values are looked up.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
231
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
232 Matrices can be either horizontal or vertical. In a horizontal
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
233 matrix, the columns correspond to the positions within the pattern,
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
234 and the rows correspond to the letters. Each row begins with the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
235 corresponding letter (or integer, if the "-i" option is used). In a
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
236 vertical matrix, the rows correspond to the positions within the
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
237 pattern, and the columns correspond to the letters. The first row
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
238 contains the letters (or integers, if the "-i" option is used)
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
239 corresponding to each column. In both types of matrices, spaces,
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
240 tabs, and vertical bars (|) are ignored. The output of the "consensus"
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
241 and "wconsensus" programs consists of horizontal alignment matrices.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
242
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
243 The input files can contain comments according to the following
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
244 convention. The portion of a line following a ';', '%', or '#' is
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
245 considered a comment and is ignored. Comments can begin anywhere in a
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
246 line and always end at the end of the line. The output of this
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
247 program is sent to the standard output.
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
248
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
249
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
250 ]]></help>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
251 <citations>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
252 <citation type="doi">10.1093/bioinformatics/15.7.563</citation>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
253 </citations>
f9ab3aa3e538 Uploaded
mvdbeek
parents:
diff changeset
254 </tool>