Mercurial > repos > mvdbeek > patser
comparison patser-v3e.xml @ 0:f9ab3aa3e538 draft
Uploaded
author | mvdbeek |
---|---|
date | Tue, 07 Apr 2015 12:06:36 -0400 |
parents | |
children | 4d9823e0f6f7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f9ab3aa3e538 |
---|---|
1 <tool id="patser-v3e" name="patser" description="finds putative transcription factor binding sites" version="0.1.2"> | |
2 <requirements> | |
3 <requirement type="package" version="v3e">patser</requirement> | |
4 </requirements> | |
5 <stdio> | |
6 <exit_code range="1:" /> | |
7 </stdio> | |
8 | |
9 <command><![CDATA[ | |
10 ## Patser cannot read plain fasta: each sequence name must be followed by its sequence enclosed between backslashes. | |
11 ## The first awk prints a backslash line before every fasta header, the second prints one right after the header, | |
12 ## tail drops the spurious backslash emitted before the very first header, and echo appends the closing backslash. | |
13 ## The boolean flags (v, c, p) expand to their option or to nothing, so they must stay unquoted on the patser line. | |
14 awk '/>/{print "\\"}1' "$input_fasta"|awk '/>/{print;print "\\";next}1'|tail -n +2 > special.fa; | |
15 echo "\\" >> special.fa; | |
16 patser-v3e -A a:t "$at" c:g "$gc" -m "$input_matrix" $v -b "$b" $c -d1 -ls "$ls" -f special.fa $p > "$output1" | |
17 ]]></command> | |
18 <inputs> <!-- boolean params carry their command-line flag in truevalue and expand to nothing when unchecked --> | |
19 <param type="data" name="input_matrix" format="txt" label="Alignment matrix" help="Provide alignment matrix file"/> | |
20 <param type="data" name="input_fasta" format="fasta" label="Sequences to score" help="Fasta file with sequence"/> | |
21 <param name="v" type="boolean" label="the matrix is a vertical matrix (default: horizontal matrix)" | |
22 truevalue="-v" falsevalue="" | |
23 help="commandline option -v" /> | |
24 <param name="b" type="integer" label="Correction added to the elements of the alignment matrix" | |
25 value="1" min="0" | |
26 help="commandline option -b" /> | |
27 <param name="gc" type="float" label="Enter the GC frequency" | |
28 value="0.25" min="0" max="1" | |
29 help="commandline option -A c:g (value)" /> | |
30 <param name="at" type="float" label="Enter the AT frequency" | |
31 value="0.25" min="0" max="1" | |
32 help="commandline option -A a:t (value)" /> | |
33 <param name="c" type="boolean" label="Also score the complementary sequences" | |
34 truevalue="-c" falsevalue="" checked="true" | |
35 help="commandline option -c: Also score the complementary sequences. The complements are determined by the program and are not explicitly stated in the sequence fasta" /> | |
36 <param name="p" type="boolean" label="print the weight matrix derived from the alignment matrix" | |
37 truevalue="-p" falsevalue="" checked="true" | |
38 help="commandline option -p" /> | |
39 <param name="ls" type="float" label="Lower-threshold score, inclusive" | |
40 value="7" | |
41 help="commandline option -ls" /> | |
42 </inputs> | |
43 <outputs> | |
44 <data name="output1" format="txt" /> <!-- filled by redirecting patser's stdout to $output1 in the command; nothing is written to the work dir --> | |
45 </outputs> | |
46 <tests> | |
47 <test> | |
48 <param name="input_matrix" value="PWM_training_EcR-USP.txt"/> | |
49 <param name="input_fasta" value="EcR_USP_224.fa"/> | |
50 <output name="output1" file="output.txt" lines_diff="6"/> | |
51 </test> | |
52 </tests> | |
53 <help><![CDATA[ | |
54 | |
55 This wrapper has been written by Marius van den Beek (m.vandenbeek at gmail.com). | |
56 Patser is available from http://stormo.wustl.edu/resources.html . | |
57 | |
58 ------------------------------------------------------------------------------- | |
59 | |
60 The following options can be determined on the command line: | |
61 | |
62 :: | |
63 | |
64 0) -h: print these directions. | |
65 | |
66 1) Matrix options. | |
67 -m filename: (default name is "matrix") file containing the matrix. | |
68 -w: the matrix is a weight matrix (default: alignment matrix) | |
69 -b number: a non-negative number indicating the total number of | |
70 pseudo-counts added to each alignment position (default: 1). | |
71 Before converting an alignment matrix to a weight matrix, the | |
72 total pseudo-counts multiplied by the a priori probability | |
73 (see section 3 below) of the corresponding letter is added | |
74 to each matrix element. | |
75 -v: the matrix is a vertical matrix (default: horizontal matrix). | |
76 -p: print the weight matrix derived from the alignment matrix. | |
77 | |
78 2) -f filename: this file (default: read from the standard input) contains | |
79 the names of the sequences. The corresponding sequence may follow | |
80 its name if the sequence is enclosed between backslashes (\). | |
81 Otherwise, the sequence is assumed to be in a separate file having | |
82 the indicated name. | |
83 | |
84 In the sequences, whitespace, slashes (/), periods, dashes (unless | |
85 part of an integer when the "-i" option is used), and comments | |
86 beginning with ';', '%', or '#' are ignored. When using letter | |
87 characters (i.e., with the "-a" or "-A" alphabet option), integers | |
88 are also ignored so that the sequence file can contain positional | |
89 information. When using integer characters (i.e., with the "-i" | |
90 alphabet option) the integers must be separated by whitespace. | |
91 | |
92 A "-c" preceding the name of a sequence file indicates that the | |
93 corresponding sequence is circular. | |
94 | |
95 3) Alphabet options---the three options in this section are mutually | |
96 exclusive (default: "-a alphabet"). The a priori probabilities mentioned | |
97 below are used when converting an alignment matrix to a weight matrix. | |
98 -a filename: file containing the alphabet and normalization information. | |
99 | |
100 Each line contains a letter (a symbol in the alphabet) followed by an | |
101 optional normalization number (default: 1.0). The normalization is | |
102 based on the relative a priori probabilities of the letters. For | |
103 nucleic acids, this might be the genomic frequency of the bases | |
104 or the frequencies observed in the data used to generate the alignment. | |
105 In nucleic acid alphabets, a letter and its complement appear on the | |
106 same line, separated by a colon (a letter can be its own complement, | |
107 e.g. when using a dimer alphabet). Complementary letters may use the | |
108 same normalization number. Only the standard 26 letters are | |
109 permissible; however, when the "-CS" option is used, the alphabet is | |
110 case sensitive so that a total of 52 different characters are possible. | |
111 | |
112 POSSIBLE LINE FORMATS WITHOUT COMPLEMENTARY LETTERS: | |
113 letter | |
114 letter normalization | |
115 | |
116 POSSIBLE LINE FORMATS WITH COMPLEMENTARY LETTERS: | |
117 letter:complement | |
118 letter:complement normalization | |
119 letter:complement normalization:complement's_normalization | |
120 | |
121 -i filename: same as the "-a" option, except that the symbols of | |
122 the alphabet are represented by integers rather than by letters. | |
123 Any integer permitted by the machine is a permissible symbol. | |
124 | |
125 -A alphabet_and_normalization_information: same as "-a" option, except | |
126 information appears on the command line (e.g., -A a:t 3 c:g 2). | |
127 | |
128 4) Alphabet modifiers indicating whether ascii alphabets are case | |
129 sensitive---the two options in this section are mutually exclusive | |
130 with each other and with the "-i" option (default: ascii alphabets are | |
131 case insensitive). | |
132 -CS: ascii alphabets are case sensitive. | |
133 -CM: ascii alphabets are case insensitive, but mark the location | |
134 of lowercase letters by printing a line containing their locations. | |
135 This option is useful when lowercase letters indicate a functional | |
136 landmark such as a transcriptional start in a DNA sequence. | |
137 | |
138 5) Options for adjusting or restricting which information | |
139 and scores are printed. | |
140 The "-ls", "-li", and "-lp" options are mutually exclusive. | |
141 -c: also score the complementary sequences. The complements are | |
142 determined by the program and are not explicitly stated in the | |
143 sequence files. | |
144 -ls number: lower threshold for printing scores, inclusive | |
145 (formerly the -l option). | |
146 -li: assume that the maximum ln(p-value) for printing scores equals | |
147 the negative of the sample-size adjusted information content; | |
148 indirectly determines the lower threshold for printing scores. | |
149 -lp number: the maximum ln(p-value) for printing scores; indirectly | |
150 determines the lower threshold for printing scores. | |
151 -u number: upper threshold for printing scores, exclusive. | |
152 | |
153 -t: just print the top score for each sequence. | |
154 -t number: print the indicated number of top scores for each sequence. | |
155 -ds: if the "-t number" option is used, print the top scores for each | |
156 sequence in the order of decreasing score (default: order the | |
157 scores according to their position within the sequence). | |
158 -e number: the small difference for considering 2 scores equal | |
159 (default: 0.000001) | |
160 | |
161 -s: print the sequence corresponding to each score that is printed. | |
162 | |
163 6) Options indicating how unrecognized symbols are treated (default: -d1). | |
164 Symbols are letters when option "-a" or "-A" is used; | |
165 symbols are integers when option "-i" is used. | |
166 The following three options are mutually exclusive. | |
167 -d0: treat unrecognized symbols as errors and exit the program. | |
168 -d1: treat unrecognized symbols as discontinuities, but print a warning. | |
169 Treating a symbol as a discontinuity means that any L-mer | |
170 containing the unrecognized symbol will be ignored. | |
171 -d2: treat unrecognized symbols as discontinuities, and print NO warning. | |
172 | |
173 7) Options for adjusting the estimation of p-value. | |
174 If the -R option is set to zero, the p-value is not estimated. | |
175 -R number: the range for approximating a column of the weight matrix with | |
176 integers (default: 10000). This number is the difference | |
177 between the largest and smallest integers used to estimate | |
178 the scores. Higher values increase precision, but will take | |
179 longer to calculate the table of p-values. | |
180 -M number: the minimum score for approximating p-values (default: 0). | |
181 Higher values will increase precision, | |
182 but may miss interesting scores. | |
183 | |
184 | |
185 :: | |
186 | |
187 ---------------------------------------------------------------------- | |
188 | |
189 Copyright 1990, 1994, 1995, 1996, 2000, 2001, 2002 Gerald Z. Hertz | |
190 May be copied for noncommercial purposes. | |
191 | |
192 Author: | |
193 Gerald Z. Hertz | |
194 gzhertz AT alum.mit.edu | |
195 | |
196 PATSER (version 3e) | |
197 | |
198 This program scores the L-mers (subsequences of length L) of the | |
199 indicated sequences against the indicated alignment or weight matrix. | |
200 The elements of an alignment matrix are simply the number of times | |
201 that the indicated letter is observed at the indicated position of a | |
202 sequence alignment. Such elements must be processed before the matrix | |
203 can be used to score an L-mer (e.g., Hertz and Stormo, 1999, | |
204 Bioinformatics, 15:563-577). A weight matrix is a matrix whose | |
205 elements are in a form considered appropriate for scoring an L-mer. | |
206 | |
207 Each element of an alignment matrix is converted to an element of a | |
208 weight matrix by first adding pseudo-counts in proportion to the a | |
209 priori probability of the corresponding letter (see option "-b" in | |
210 section 1 above) and dividing by the total number of sequences plus | |
211 the total number of pseudo-counts. The resulting frequency is | |
212 normalized by the a priori probability for the corresponding letter. | |
213 The final quotient is converted to an element of a weight matrix by | |
214 taking its natural logarithm. The use of pseudo-counts here differs | |
215 from previous versions of this program by being proportional to the a | |
216 priori probability. | |
217 | |
218 Version 3 of this program differs from previous versions by also | |
219 numerically estimating the p-value of the scores. The p-value | |
220 calculated here is the probability of observing a particular score or | |
221 higher at a particular sequence position and does NOT account for the | |
222 total amount of sequence being scored. P-values are estimated by the | |
223 method described in Staden, 1989, CABIOS, p. 89--96. The relative | |
224 value for each element of the weight matrix is approximated by | |
225 integers in a range determined by the "-R" and "-M" options (section 7 | |
226 above). The p-value is calculated for each possible integer score and | |
227 the values are stored. The actual scores for the sequences are | |
228 determined from the true weight matrix. The true scores are converted | |
229 to their corresponding integer values and their p-values are looked up. | |
230 | |
231 Matrices can be either horizontal or vertical. In a horizontal | |
232 matrix, the columns correspond to the positions within the pattern, | |
233 and the rows correspond to the letters. Each row begins with the | |
234 corresponding letter (or integer, if the "-i" option is used). In a | |
235 vertical matrix, the rows correspond to the positions within the | |
236 pattern, and the columns correspond to the letters. The first row | |
237 contains the letters (or integers, if the "-i" option is used) | |
238 corresponding to each column. In both types of matrices, spaces, | |
239 tabs, and vertical bars (|) are ignored. The output of the "consensus" | |
240 and "wconsensus" programs consists of horizontal alignment matrices. | |
241 | |
242 The input files can contain comments according to the following | |
243 convention. The portion of a line following a ';', '%', or '#' is | |
244 considered a comment and is ignored. Comments can begin anywhere in a | |
245 line and always end at the end of the line. The output of this | |
246 program is sent to the standard output. | |
247 | |
248 | |
249 ]]></help> | |
250 <citations> | |
251 <citation type="doi">10.1093/bioinformatics/15.7.563</citation> | |
252 </citations> | |
253 </tool> |