Mercurial > repos > rnateam > package_antarna_1_1
comparison antaRNA.py @ 0:3abbf92e2ed2 draft
Uploaded
author | rnateam |
---|---|
date | Thu, 30 Apr 2015 08:27:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3abbf92e2ed2 |
---|---|
1 import numpy | |
2 import sys | |
3 import random | |
4 import subprocess | |
5 import re | |
6 import decimal | |
7 import math | |
8 import os | |
9 import shutil | |
10 import time | |
11 import types | |
12 import argparse | |
13 #from argparse import RawTextHelpFormatter | |
14 | |
15 ############################# | |
16 # FUNCTIONS | |
17 | |
18 def print2file(f, i, m): | |
19 """ | |
20 print content i to file f in mode m | |
21 """ | |
22 line = str(i) | |
23 if m == "a": | |
24 call = "echo \"" + line + "\" >> " + f | |
25 elif m == "w": | |
26 call = "echo \"" + line + "\" > " + f | |
27 os.system(call) | |
28 | |
29 # checking and correcting the alphabet of the constraint sequence | |
30 def checkSequenceConstraint(SC): | |
31 """ | |
32 Checks the Sequence constraint for illegal nucleotide characters | |
33 """ | |
34 out = "" | |
35 for c in SC: | |
36 c = c.upper() | |
37 if c not in "ACGURYSWKMBDHVN": | |
38 # and c!= "R" and c != "Y" and c != "S" and c != "W" and c != "K" and c != "M" and c != "B" and c != "D" and c != "H" and c != "V": | |
39 if c == "T": | |
40 c = "U" | |
41 else: | |
42 print "\tIllegal Character in the constraint sequence!" | |
43 print "\tPlease use the IUPAC nomenclature for defining nucleotides in the constraint sequence!" | |
44 print "\tA Adenine" | |
45 print "\tC Cytosine" | |
46 print "\tG Guanine" | |
47 print "\tT/U Thymine/Uracil" | |
48 print "\tR A or G" | |
49 print "\tY C or T/U" | |
50 print "\tS G or C" | |
51 print "\tW A or T/U" | |
52 print "\tK G or T/U" | |
53 print "\tM A or C" | |
54 print "\tB C or G or T/U" | |
55 print "\tD A or G or T/U" | |
56 print "\tH A or C or T/U" | |
57 print "\tV A or C or G" | |
58 print "\tN any base" | |
59 exit(0) | |
60 out += c | |
61 return (1, out) | |
62 | |
63 | |
64 def transform(seq): | |
65 """ | |
66 Transforms "U" to "T" for the processing is done on DNA alphabet | |
67 """ | |
68 S = "" | |
69 for s in seq: | |
70 if s == "T": | |
71 S += "U" | |
72 else: | |
73 S += s | |
74 return S | |
75 | |
76 | |
77 def checkSimilarLength(s, SC): | |
78 """ | |
79 Compares sequence and structure constraint length | |
80 """ | |
81 if len(s) == len(SC): | |
82 return 1 | |
83 else: | |
84 return 0 | |
85 | |
86 | |
87 def isStructure(s): | |
88 """ | |
89 Checks if the structure constraint only contains "(", ")", and "." and legal fuzzy structure constraint characters. | |
90 """ | |
91 returnvalue = 1 | |
92 for a in range(0,len(s)): | |
93 if s[a] not in ".()[]{}<>": | |
94 if s[a] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
95 returnvalue = 0 | |
96 return returnvalue | |
97 | |
98 | |
99 def isBalanced(s): | |
100 """ | |
101 Check if the structure s is of a balanced nature | |
102 """ | |
103 | |
104 balance = 1 | |
105 for bracket in ["()", "[]", "{}", "<>"]: | |
106 counter = 0 | |
107 for a in xrange(len(s)): | |
108 if s[a] in bracket[0]: | |
109 counter += 1 | |
110 elif s[a] in bracket[1]: | |
111 counter -= 1 | |
112 if counter != 0: | |
113 balance = 0 | |
114 return balance | |
115 | |
116 | |
117 | |
118 def fulfillsHairpinRule(s): | |
119 """ | |
120 CHECKING FOR THE 3 nt LOOP INTERSPACE | |
121 for all kind of basepairs, even wihtin the pdeudoknots | |
122 """ | |
123 | |
124 fulfillsRules = 1 | |
125 for bracket in ["()", "[]", "{}", "<>"]: | |
126 last_opening_char = 0 | |
127 check = 0 | |
128 for a in xrange(len(s)): | |
129 if s[a] == bracket[0]: | |
130 last_opening_char = a | |
131 check = 1 | |
132 elif s[a] == bracket[1] and check == 1: | |
133 check = 0 | |
134 if a - last_opening_char < 4: | |
135 return 0 | |
136 return 1 | |
137 | |
138 | |
139 def isValidStructure(s): | |
140 """ | |
141 Checks, if the structure s is a valid structure | |
142 """ | |
143 | |
144 Structure = isStructure(s) | |
145 Balanced = isBalanced(s) | |
146 HairpinRule = fulfillsHairpinRule(s) | |
147 | |
148 if Structure == 1 and Balanced == 1 and HairpinRule == 1: | |
149 return 1 | |
150 else: | |
151 print Structure, Balanced, HairpinRule | |
152 return 0 | |
153 | |
154 def loadIUPACcompatibilities(IUPAC, useGU): | |
155 """ | |
156 Generating a hash containing all compatibilities of all IUPAC RNA NUCLEOTIDES | |
157 """ | |
158 compatible = {} | |
159 for nuc1 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE | |
160 sn1 = list(IUPAC[nuc1]) | |
161 for nuc2 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE | |
162 sn2 = list(IUPAC[nuc2]) | |
163 compatib = 0 | |
164 for c1 in sn1: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE: | |
165 for c2 in sn2: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE: | |
166 # CHECKING THEIR COMPATIBILITY | |
167 if useGU == True: | |
168 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C") or (c1 == "G" and c2 == "U") or (c1 == "U" and c2 == "G"): | |
169 compatib = 1 | |
170 else: | |
171 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C"): | |
172 compatib = 1 | |
173 compatible[nuc1 + "_" + nuc2] = compatib # SAVING THE RESPECTIVE GROUP COMPATIBILITY, REVERSE SAVING IS NOT REQUIRED, SINCE ITERATING OVER ALL AGAINST ALL | |
174 return compatible | |
175 | |
176 def isCompatibleToSet(c1, c2, IUPAC_compatibles): | |
177 """ | |
178 Checks compatibility of c1 wihtin c2 | |
179 """ | |
180 compatible = True | |
181 for setmember in c2: | |
182 #print setmember | |
183 if isCompatible(c1, setmember, IUPAC_compatibles) == False: | |
184 return False | |
185 return compatible | |
186 | |
187 | |
188 def isCompatible(c1, c2, IUPAC_compatibles): | |
189 """ | |
190 Checks compatibility between character c1 and c2 | |
191 """ | |
192 if IUPAC_compatibles[c1 + "_" + c2] == 1: | |
193 return True | |
194 else: | |
195 return False | |
196 | |
197 | |
198 def isStructureCompatible(lp1, lp2 ,bp): | |
199 """ | |
200 Checks, if the region within lp1 and lp2 is structurally balanced | |
201 """ | |
202 x = lp1 + 1 | |
203 while (x < lp2): | |
204 if (bp[x] <= lp1 or bp[x] > lp2): | |
205 return False | |
206 if x == bp[x]: | |
207 x += 1 | |
208 else: | |
209 x = bp[x] + 1 | |
210 return x == lp2 | |
211 | |
212 | |
213 def checkConstaintCompatibility(basepairstack, sequenceconstraint, IUPAC_compatibles): | |
214 """ | |
215 Checks if the constraints are compatible to each other | |
216 """ | |
217 returnstring = "" | |
218 compatible = 1 | |
219 for id1 in basepairstack: # key = (constraint , (pos, constraint))) | |
220 constr1 = basepairstack[id1][0] | |
221 id2 = basepairstack[id1][1][0] | |
222 constr2 = basepairstack[id1][1][1] | |
223 | |
224 if id1 != id2 and not isCompatible(constr1, constr2, IUPAC_compatibles): | |
225 | |
226 compatible = 0 | |
227 returnstring += "nucleotide constraint " + str(constr1) + " at position " + str(id1) + " is not compatible with nucleotide constraint " + str(constr2) + " at position " + str(id2) + "\n" | |
228 #if not isCompatible(basepairstack[basepair][0], basepairstack[basepair][1][1]): | |
229 | |
230 #compatible = 0 | |
231 #else: | |
232 #returnstring += "nucleotide constraint " + str(basepairstack[basepair][0]) + " at position " + str(basepair) + " is compatible with nucleotide constraint " + str(basepairstack[basepair][1][1]) + " at position " + str(basepairstack[basepair][1][0]) + "\n" | |
233 return (compatible, returnstring) | |
234 | |
235 | |
236 def getLP(BPSTACK): | |
237 """ | |
238 Retreives valid lonley base pairs from a base pair stack | |
239 """ | |
240 #20 ('N', (>BLOCK<, 'N')) | |
241 | |
242 # geting single base pairs | |
243 stack = {} | |
244 LP = {} | |
245 if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType: | |
246 for i in BPSTACK.keys(): | |
247 #if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
248 stack[i] = int(BPSTACK[i][1][0]) | |
249 #print i , BPSTACK[i][1][0] | |
250 else: | |
251 for i in BPSTACK.keys(): | |
252 #if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
253 stack[i] = BPSTACK[i] | |
254 | |
255 # removing redundant base pair indices | |
256 for i in stack.keys(): | |
257 if i >= stack[i]: | |
258 del stack[i] | |
259 | |
260 # actual checking for single lonley base pairs | |
261 for i in stack.keys(): | |
262 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and not (i+1 in stack and stack[i+1] == stack[i] - 1): | |
263 LP[i] = stack[i] | |
264 | |
265 ##actual removal of 2er lonley base pairs | |
266 for i in stack.keys(): | |
267 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and (i+1 in stack and stack[i+1] == stack[i] - 1) and not (i+2 in stack and stack[i+2] == stack[i] - 2): | |
268 LP[i] = stack[i] | |
269 LP[i+1] = stack[i+1] | |
270 | |
271 | |
272 #if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType: | |
273 #for i in BPSTACK.keys(): | |
274 | |
275 ##if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
276 #stack[i] = int(BPSTACK[i][1][0]) | |
277 ##print i , BPSTACK[i][1][0] | |
278 #else: | |
279 #for i in BPSTACK.keys(): | |
280 ##if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
281 #stack[i] = BPSTACK[i] | |
282 | |
283 #for i in stack.keys(): | |
284 #if i >= stack[i]: | |
285 #del stack[i] | |
286 | |
287 | |
288 | |
289 return LP | |
290 | |
291 | |
292 def getBPStack(s, seq): | |
293 """ | |
294 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq. | |
295 """ | |
296 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]} | |
297 bpstack = {} | |
298 for i in xrange(len(s)): | |
299 | |
300 # REGULAR SECONDARY STRUCTURE DETECTION | |
301 if s[i] in "(){}[]<>": | |
302 | |
303 no = 0 | |
304 ### opening | |
305 if s[i] in "([{<": | |
306 if s[i] == "(": | |
307 tmp_stack["()"].append((i, seq[i])) | |
308 elif s[i] == "[": | |
309 tmp_stack["[]"].append((i, seq[i])) | |
310 elif s[i] == "{": | |
311 tmp_stack["{}"].append((i, seq[i])) | |
312 elif s[i] == "<": | |
313 tmp_stack["<>"].append((i, seq[i])) | |
314 | |
315 #closing | |
316 elif s[i] in ")]}>": | |
317 if s[i] == ")": | |
318 no, constr = tmp_stack["()"].pop() | |
319 elif s[i] == "]": | |
320 no, constr = tmp_stack["[]"].pop() | |
321 elif s[i] == "}": | |
322 no, constr = tmp_stack["{}"].pop() | |
323 elif s[i] == ">": | |
324 no, constr = tmp_stack["<>"].pop() | |
325 bpstack[no] = (constr, (i, seq[i])) | |
326 bpstack[i] = (seq[i] ,(no, constr)) | |
327 | |
328 elif s[i] == ".": | |
329 bpstack[i] = (seq[i], (i, seq[i])) | |
330 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
331 bpstack[i] = (seq[i], (i, seq[i])) | |
332 | |
333 return (bpstack, getLP(bpstack)) | |
334 | |
335 | |
336 def getbpStack(s): | |
337 """ | |
338 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq. | |
339 """ | |
340 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]} | |
341 bpstack = {} | |
342 | |
343 for i in xrange(len(s)): | |
344 if s[i] in "(){}[]<>": | |
345 | |
346 no = 0 | |
347 ### opening | |
348 if s[i] in "([{<": | |
349 if s[i] == "(": | |
350 tmp_stack["()"].append(i) | |
351 elif s[i] == "[": | |
352 tmp_stack["[]"].append(i) | |
353 elif s[i] == "{": | |
354 tmp_stack["{}"].append(i) | |
355 elif s[i] == "<": | |
356 tmp_stack["<>"].append(i) | |
357 | |
358 #closing | |
359 elif s[i] in ")]}>": | |
360 if s[i] == ")": | |
361 no = tmp_stack["()"].pop() | |
362 elif s[i] == "]": | |
363 no = tmp_stack["[]"].pop() | |
364 elif s[i] == "}": | |
365 no = tmp_stack["{}"].pop() | |
366 elif s[i] == ">": | |
367 no = tmp_stack["<>"].pop() | |
368 bpstack[no] = i # save basepair in the format {opening base id (opening seq constr,(closing base id, closing seq constr))} | |
369 bpstack[i] = no # save basepair in the format {closing base id (closing seq constr,(opening base id, opening seq constr))} | |
370 | |
371 elif s[i] == ".": # no structural constaint given: produce entry, which references itself as a base pair partner.... | |
372 bpstack[i] = i | |
373 | |
374 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
375 bpstack[i] = i | |
376 | |
377 #elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
378 ## per position, assigned to a certain block, the target nucleotide, with whcih it should interact is marked with the specific | |
379 ## block character | |
380 #bpstack[i] = s[i] | |
381 | |
382 return (bpstack, getLP(bpstack)) | |
383 | |
384 def maprange( a, b, s): | |
385 """ | |
386 Mapping function | |
387 """ | |
388 (a1, a2), (b1, b2) = a, b | |
389 return b1 + ((s - a1) * (b2 - b1) / (a2 - a1)) | |
390 | |
391 | |
392 def applyGCcontributionPathAdjustment(pathlength, tmpGC, nt): | |
393 """ | |
394 GC path length contribution calculation. | |
395 """ | |
396 GCadjustment = 1.5 | |
397 minimum = 0.5 | |
398 upper = GCadjustment | |
399 lower = minimum | |
400 | |
401 if nt == "A" or nt == "U": | |
402 pathlength = pathlength * maprange( (0, 1) , (lower, upper), tmpGC) | |
403 | |
404 if nt == "G" or nt == "C": | |
405 #pathlength = pathlength * (float(1-tmpGC)) | |
406 pathlength = pathlength * maprange( (1, 0) , (lower, upper), tmpGC) | |
407 return pathlength | |
408 | |
409 def getConstraint(TE, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements): | |
410 """ | |
411 Dependend on the situation in the constraint an the respective path section, setting wether a specific constraint can be given or not (for that path section) | |
412 """ | |
413 # TE :: transition element / path section under dispute | |
414 # id1 :: id of the position of the caharacter to which the transition is leading to | |
415 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present | |
416 # val :: BPstack information of the specific position | |
417 # constr1 :: constraining character of pos id1 | |
418 # constr2 :: constraining character of pos id2 | |
419 | |
420 id1 = int(TE.split(".")[0]) | |
421 val = BPstack[id1] # check out the value of the destination character in the basepair/constraint stack | |
422 constr1 = val[0] # getting the constraint character of position id1 | |
423 id2 = int(val[1][0]) # getting position id2 | |
424 constr2 = val[1][1] # getting the sequence constraint for position id2 | |
425 targetNucleotide = TE.split(".")[1][-1:] # where the edge is leading to | |
426 | |
427 c1 = set(IUPAC[constr1]) # getting all explicit symbols of c1 | |
428 c2 = set(IUPAC_reverseComplements[constr2]) # getting the reverse complement explicit symbols of c2 | |
429 | |
430 if targetNucleotide in c1: | |
431 if id1 == id2: | |
432 return 1 | |
433 else: | |
434 if targetNucleotide in c2: | |
435 return 1 | |
436 else: | |
437 return 0 | |
438 else: | |
439 return 0 | |
440 | |
441 """ | |
442 def getConstraint(TE, BPstack): | |
443 # TE :: transition element / path section | |
444 # id1 :: id of the position of the caharacter to which the transition is leading to | |
445 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present | |
446 # val :: BPstack information of the specific position | |
447 # constr1 :: constraining character of pos id1 | |
448 # constr2 :: constraining character of pos id2 | |
449 | |
450 ### BPstack [id1] = (constr1, (id2, constr2)) | |
451 | |
452 id1 = TE.split(".")[0] | |
453 #print id1 | |
454 #id1 = TE.find(TE.strip("_")) # strip the path section and getting the position of the section | |
455 #if len(TE.strip("_")) == 2: # check if the path section is from an internal and not an initial transition | |
456 #id1 += 1 # increase position id1 by 1, since the last character of the section is the destination character | |
457 val = BPstack[int(id1)] # check out the value of the destination character in the basepair/constraint stack | |
458 constr1 = val[0] # getting the constraint character of position id1 | |
459 id2 = val[1][0] # getting position id2 | |
460 constr2 = val[1][1] # getting the sequence constraint for position id2 | |
461 #print TE, id1, constr1, id2, constr2, | |
462 | |
463 #TE.split(".")[1][-1:] | |
464 if id1 == id2: # both ids were the same with either character, sequential or no sequential constraint -> no basepair constraint | |
465 if constr1 == TE.split(".")[1][-1:] and constr2 == TE.split(".")[1][-1:]: # case if the single base constraints on position id1 == id2 are the same as the destination character on id1 | |
466 #print 1 | |
467 return 1 | |
468 elif constr1 == constr2 == "N": # case if the single base constraints on position id1 == id2 has no constraint | |
469 #print 1 | |
470 return 1 | |
471 else: # single base sequence constraints differ | |
472 #print 0 | |
473 return 0 | |
474 | |
475 elif id1 != id2: # showing differentq ids, indicating a bp, (basepair structural constraint) | |
476 if constr1 == "N" and constr2 == "N": # no sequence constraint | |
477 #print 1 | |
478 return 1 | |
479 if constr1 == "N" and constr2 != "N": # c1 has no constraint, c2 has character constraint (sequence constraint of closing bases) | |
480 if TE.split(".")[1][-1:] == complementBase(constr2): # the current path section destination base is equal to the complement base of the mentioned sequence constraint in constr2 | |
481 #print 1 | |
482 return 1 | |
483 else: # case if the current path section destination base is not equeal to the mentioned complement sequence constraint in constr2 | |
484 #print 0 | |
485 return 0 | |
486 if constr1 != "N" and constr2 == "N": # c1 has character constraint, c2 has no character constraint (sequence constraint in the opening base) | |
487 if TE.split(".")[1][-1:] == constr1: # the current path section destination base is as constrained with constr1 | |
488 #print 1 | |
489 return 1 | |
490 else: # the current path section destination base is not as constrained in constr1 | |
491 #print 0 | |
492 return 0 | |
493 if constr1 != "N" and constr2 != "N": # both positions have sequential constraint | |
494 if TE.split(".")[1][-1:] == constr1: | |
495 #print 1 | |
496 return 1 | |
497 else: | |
498 #print 0 | |
499 return 0 | |
500 """ | |
501 | |
502 def applyTerrainModification(terrain, s, tmpGC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements): | |
503 #nucleotides = {'A': 0, 'C': 1,'G': 2,'T': 3} | |
504 | |
505 dels = [] | |
506 for terrainelement in sorted(terrain): | |
507 pheromone, pathlength = terrain[terrainelement] | |
508 pheromone = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements) | |
509 pathlength = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements) | |
510 pathlength = applyGCcontributionPathAdjustment(pathlength, tmpGC,terrainelement.split(".")[1][-1:]) | |
511 if pheromone * pathlength == 0: dels.append(terrainelement) | |
512 terrain[terrainelement] = (pheromone, pathlength,[]) | |
513 further_dels = {} | |
514 for terrainelement in sorted(dels): | |
515 pos, nucs = terrainelement.split(".") | |
516 if int(pos) < len(s)-1: | |
517 to_nt = nucs[-1:] | |
518 successor_pos = int(pos) + 1 | |
519 for i in ["A", "C", "G", "U"]: | |
520 del_element = str(successor_pos) + "." + to_nt + i | |
521 further_dels[del_element] = 1 | |
522 further_dels[terrainelement] = 1 | |
523 # deleting the inbound and outbound edges, which are forbidden | |
524 for terrainelement in further_dels: | |
525 del terrain[terrainelement] | |
526 # allocate the appropriate children of edges | |
527 for terrainelement in terrain: | |
528 pheromone, pathlength, children = terrain[terrainelement] | |
529 pos, nucs = terrainelement.split(".") | |
530 if int(pos) < len(s): | |
531 to_nt = nucs[-1:] | |
532 successor_pos = int(pos) + 1 | |
533 for i in ["A", "C", "G", "U"]: | |
534 if str(successor_pos) + "." + to_nt + i in terrain: | |
535 children.append(str(successor_pos) + "." + to_nt + i) | |
536 terrain[terrainelement] = (pheromone, pathlength,children) | |
537 starts = [] | |
538 for i in ["A", "C", "G", "U"]: | |
539 if str(0) + "." + i in terrain: | |
540 starts.append(str(0) + "." + i) | |
541 terrain["00.XY"] = (1, 1, starts) | |
542 return (terrain, BPstack) | |
543 | |
544 | |
545 def initTerrain(s): # THE CLASSIC | |
546 """ | |
547 Initialization of the terrain with graph like terrain... vertices are modeled implicitly | |
548 """ | |
549 nt = ["A","C","G","U"] | |
550 nt2 = ["AA","AC","AG","AU","CA","CC","CG","CU","GA","GC","GG","GU","UA","UC","UG","UU"] # Allowed dinucleotides | |
551 e = {} | |
552 pathlength = 1 | |
553 pheromone = 1 | |
554 for p in xrange(len(s)): | |
555 if p == 0: | |
556 for i in nt: | |
557 e["%s.%s"%(p,i)] = (pheromone, pathlength) | |
558 elif p > 0: | |
559 for n in nt2: | |
560 e["%s.%s"%(p,n)] = (pheromone, pathlength) | |
561 return e | |
562 | |
563 | |
564 | |
565 def complementBase(c): | |
566 """ | |
567 Returns the complement RNA character of c (without GU base pairs) | |
568 """ | |
569 retChar = "" | |
570 if c == "A" : | |
571 retChar = "U" | |
572 elif c == "U": | |
573 retChar = "A" | |
574 elif c == "C": | |
575 retChar = "G" | |
576 elif c == "G": | |
577 retChar = "C" | |
578 return retChar | |
579 | |
580 def printTerrain(terrain): | |
581 #print sorted(terrain.keys()) | |
582 tmp_i = "0" | |
583 tmp_c = 0 | |
584 terrain = terrain[0] | |
585 | |
586 for a, i in enumerate(sorted(terrain.keys())): | |
587 #print a | |
588 if i.split(".")[0] != tmp_i: | |
589 print "\nElements:", tmp_c,"\n#########################\n", i, terrain[i] | |
590 | |
591 tmp_c = 1 | |
592 tmp_i = i.split(".")[0] | |
593 else: | |
594 print i, terrain[i] | |
595 tmp_c += 1 | |
596 | |
597 print "\nElements:", tmp_c | |
598 print "#########################" | |
599 print len(terrain) | |
600 | |
601 def pickStep(tmp_steps, summe): | |
602 """ | |
603 Selects a step within the terrain | |
604 """ | |
605 if len(tmp_steps) == 1: | |
606 return tmp_steps[0][1] # returning the nucleotide of the only present step | |
607 else: | |
608 rand = random.random() # draw random number | |
609 mainval = 0 | |
610 for choice in xrange(len(tmp_steps)): | |
611 val, label = tmp_steps[choice] | |
612 mainval += val/float(summe) | |
613 if mainval > rand: # as soon, as the mainval gets larger than the random value the assignment is done | |
614 return label | |
615 | |
616 def getPath(s, tmp_terrain, tmp_BPstack, alpha, beta, IUPAC, IUPAC_reverseComplements): | |
617 """ | |
618 Performs a walk through the terrain and assembles a sequence, while respecting the structure constraint and IUPAC base complementarity | |
619 of the base pairs GU, GC and AT | |
620 """ | |
621 nt = ["A","C","G","U"] | |
622 prev_edge = "00.XY" | |
623 sequence = "" | |
624 while len(sequence) < len(s): | |
625 coming_from = sequence[-1:] | |
626 summe = 0 | |
627 steps = [] | |
628 i = len(sequence) | |
629 allowed_nt = "ACGU" | |
630 # base pair closing case check, with subsequent delivery of a reduced allowed nt set | |
631 | |
632 if i > tmp_BPstack[i][1][0]: | |
633 jump = tmp_BPstack[i][1][0] | |
634 nuc_at_jump = sequence[jump] | |
635 allowed_nt = IUPAC_reverseComplements[nuc_at_jump] | |
636 | |
637 #allowed_nt = complementBase(nuc_at_jump) | |
638 | |
639 # Checking for every possible nt if it is suitable for the selection procedure | |
640 for edge in tmp_terrain[prev_edge][-1]: | |
641 | |
642 if edge[-1:] in allowed_nt: | |
643 pheromone, PL , children = tmp_terrain[edge] | |
644 #if PL > 0: | |
645 value = ((float(pheromone * alpha)) + ((1/float(PL)) * beta)) | |
646 summe += value | |
647 steps.append((value, edge)) | |
648 prev_edge = pickStep(steps, summe) | |
649 sequence += prev_edge[-1:] | |
650 | |
651 return sequence | |
652 | |
653 | |
654 ### | |
655 # STRUCTURE PREDICTORS | |
656 ### | |
657 def getPKStructure(sequence, temperature, mode = "A"): | |
658 """ | |
659 Initialization pKiss mfe pseudoknot prediction | |
660 """ | |
661 p2p = "pKiss" | |
662 #p2p = "/usr/local/pkiss/2014-03-17/bin/pKiss_mfe" | |
663 strategy = "--strategy " | |
664 t = "--temperature " + str(temperature) | |
665 | |
666 if mode == "A": strategy += "A" | |
667 elif mode == "B": strategy += "B" | |
668 elif mode == "C": strategy += "C" | |
669 elif mode == "D": strategy += "D" | |
670 elif mode == "P": strategy += "P" | |
671 | |
672 p = subprocess.Popen( ([p2p, "--mode mfe", strategy, t]), | |
673 #shell = True, | |
674 stdin = subprocess.PIPE, | |
675 stdout = subprocess.PIPE, | |
676 stderr = subprocess.PIPE, | |
677 close_fds = True) | |
678 #print p.stderr.readline() | |
679 | |
680 p.stdin.write(sequence+'\n') | |
681 pks = p.communicate() | |
682 structure = "".join(pks[0].split("\n")[2].split(" ")[-1:]) | |
683 return structure | |
684 | |
685 def init_RNAfold(temperature, paramFile = ""): | |
686 """ | |
687 Initialization RNAfold listener | |
688 """ | |
689 #p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAfold" | |
690 p2p = "RNAfold" | |
691 | |
692 t = "-T " + str(temperature) | |
693 P = "" | |
694 if paramFile != "": | |
695 P = "-P " + paramFile | |
696 p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]), | |
697 #shell = True, | |
698 stdin = subprocess.PIPE, | |
699 stdout = subprocess.PIPE, | |
700 stderr = subprocess.PIPE, | |
701 close_fds = True) | |
702 return p | |
703 | |
704 | |
705 def consult_RNAfold(seq, p): | |
706 """ | |
707 Consults RNAfold listener | |
708 """ | |
709 p.stdin.write(seq+'\n') | |
710 out = "" | |
711 for i in xrange(2): | |
712 out += p.stdout.readline() | |
713 return out | |
714 | |
715 | |
716 def getRNAfoldStructure(struct2, process1): | |
717 """ | |
718 Retrieves folded structure of a RNAfold call | |
719 """ | |
720 | |
721 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+') | |
722 #RNAdist_pattern = re.compile('.*\s([\d]+)') | |
723 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(struct2, process1)) | |
724 current_structure = "" | |
725 #if RNAfold_match: | |
726 return RNAfold_match.group(1) | |
727 | |
728 | |
729 def init_RNAdistance(): | |
730 """ | |
731 Initialization of RNAdistance listener | |
732 """ | |
733 #p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAdistance" | |
734 p2p = "RNAdistance" | |
735 p = subprocess.Popen( ([p2p]), | |
736 #shell = True, | |
737 stdin = subprocess.PIPE, | |
738 stdout = subprocess.PIPE, | |
739 stderr = subprocess.PIPE, | |
740 close_fds = True) | |
741 return p | |
742 | |
743 | |
744 def consult_RNAdistance(s1, s2, p): | |
745 """ | |
746 Consulting the RNAdistance listener | |
747 """ | |
748 p.stdin.write(s1+'\n') | |
749 p.stdin.write(s2+'\n') | |
750 out = "" | |
751 out_tmp = p.stdout.readline().strip() | |
752 if out_tmp != "": | |
753 out += out_tmp | |
754 return out | |
755 | |
756 def getInducingSequencePositions(Cseq, degreeOfSequenceInducement): | |
757 """ | |
758 Delimiting the degree of structure inducement by the supplied sequence constraint. | |
759 0 : no sequence induced structure constraint | |
760 1 : "ACGT" induce structure (explicit nucleotide structure inducement level) | |
761 2 : "MWKSYR" and "ACGT" (explicit and double instances) | |
762 3 : "BDHV" , "MWKSYR" and "ACGT" (explicit, double, and triple instances) | |
763 """ | |
764 setOfNucleotides = "" # resembling the "0"-case | |
765 if degreeOfSequenceInducement == 1: | |
766 setOfNucleotides = "ACGU" | |
767 elif degreeOfSequenceInducement == 2: | |
768 setOfNucleotides = "ACGUMWKSYR" | |
769 elif degreeOfSequenceInducement == 3: | |
770 setOfNucleotides = "ACGUMWKSYRBDHV" | |
771 #elif degreeOfSequenceInducement == 4: | |
772 #setOfNucleotides = "ACGTMWKSYRBDHVN" | |
773 | |
774 tmpSeq = "" | |
775 listset = setOfNucleotides | |
776 for pos in Cseq: | |
777 if pos not in listset: | |
778 tmpSeq += "N" | |
779 else: | |
780 tmpSeq += pos | |
781 | |
782 return setOfNucleotides, tmpSeq | |
783 | |
784 | |
785 def getBPDifferenceDistance(stack1, stack2): | |
786 """ | |
787 Based on the not identical amount of base pairs within both structure stacks | |
788 """ | |
789 d = 0 | |
790 for i in stack1.keys(): | |
791 # check base pairs in stack 1 | |
792 if i < stack1[i] and stack1[i] != stack2[i]: | |
793 d += 1 | |
794 # check base pairs in stack 2 | |
795 for i in stack2.keys(): | |
796 if i < stack2[i] and stack1[i] != stack2[i]: | |
797 d += 1 | |
798 return d | |
799 | |
800 | |
801 def getStructuralDistance(target_structure, Cseq, path, RNAfold, verbose, LP, BP, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy): | |
802 """ | |
803 Calculator for Structural Distance | |
804 """ | |
805 # fold the current solution's sequence to obtain the structure | |
806 | |
807 current_structure = "" | |
808 | |
809 if pseudoknots: | |
810 current_structure = getPKStructure(path,strategy) | |
811 else: | |
812 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(path, RNAfold)) | |
813 current_structure = RNAfold_match.group(1) | |
814 | |
815 # generate the current structure's base-pair stack | |
816 bp = getbpStack(current_structure)[0] | |
817 # add case-dependend structural constraints in case of lonley basepairs formation | |
818 tmp_target_structure_bp = getbpStack(target_structure)[0] | |
819 | |
820 for lp in LP: | |
821 if bp[lp] == LP[lp]: # if the base pair is within the current solution structure, re-add the basepair into the constraint structure. | |
822 #tmp_target_structure[lp] = "(" | |
823 #tmp_target_structure[LP[lp]] = ")" | |
824 tmp_target_structure_bp[lp] = LP[lp] | |
825 tmp_target_structure_bp[LP[lp]] = lp | |
826 | |
827 # REMOVE BLOCK CONSTRAINT AND SUBSTITUTE IT WITH SINGLE STRAND INFORMATION repsective with brackets, if allowed base pairs occure | |
828 # check for all allowed implicit constraint block declarators | |
829 for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
830 occurances = [] | |
831 for m in re.finditer(c, target_structure): # search for a declarator in the requested structure | |
832 occurances.append(m.start()) # save the corresponding index | |
833 | |
834 # transform declarator into single stranded request | |
835 for i in occurances: | |
836 #tmp_target_structure[i] = "." | |
837 tmp_target_structure_bp[i] = i | |
838 # infer a base pair within the block declarated positions, if the current structure provides it. | |
839 for i in occurances: | |
840 for j in occurances: | |
841 if i < j: | |
842 if bp[i] == j: | |
843 #tmp_target_structure[i] = "(" | |
844 #tmp_target_structure[bp[i]] = ")" | |
845 | |
846 tmp_target_structure_bp[i] = bp[i] | |
847 tmp_target_structure_bp[bp[i]] = i | |
848 | |
849 # CHECK FOR SEQUENCE CONSTRAINT WHICH INDUCES STRUCTURE CONSTRAINT IN THE MOMENTARY SITUATION | |
850 #print "Checking Cseq influence and it's induced basepairs..." | |
851 IUPACinducers, tmp_Cseq = getInducingSequencePositions(Cseq, degreeOfSequenceInducement) | |
852 if len(Cseq.strip("N")) > 0: | |
853 #print "Processing Cseq influence" | |
854 # Iterate over all positions within the Base Pair stack | |
855 for i in BP: # Check for each base index i | |
856 | |
857 if i < bp[i]: # if the current index is samller that the affiliated in the basepair stack of the current solution | |
858 | |
859 bp_j = bp[i] # Actual j index of the current solution | |
860 BP_j = BP[i][1][0] # j index of the requested structure | |
861 if (i != bp_j and i == BP_j and BP[i][0] in IUPACinducers ): # if i pairs with some other base in the current structure, and i is requested single stranded and the Sequence constraint is allowed to induce... | |
862 if (BP[bp_j][1][0] == bp_j and BP[bp_j][0] in IUPACinducers):# If position j is requested singlestranded and position j nucleotide can induce base pairs | |
863 #if isCompatible(bp[i][0], bp[i][1][1], IUPAC_compatibles): # If both nucleotides, i and j are actually compatible | |
864 #tmp_target_structure[i] = "(" | |
865 #tmp_target_structure[bp_j] = ")" | |
866 | |
867 tmp_target_structure_bp[i] = bp[i] | |
868 tmp_target_structure_bp[bp_j] = i | |
869 | |
870 #tts = "".join(tmp_target_structure) | |
871 dsreg = getBPDifferenceDistance(tmp_target_structure_bp, bp) | |
872 | |
873 # CHECK FOR ALL DETERMINED LONELY BASE PAIRS (i<j), if they are formed | |
874 failLP = 0 | |
875 for lp in LP: | |
876 | |
877 if bp[lp] != LP[lp]: | |
878 | |
879 isComp = isCompatible(path[lp],path[LP[lp]], IUPAC_compatibles) | |
880 isStru = isStructureCompatible(lp, LP[lp] ,bp) | |
881 if not ( isStru and isStru ): # check if the bases at the specific positions are compatible and check if the | |
882 # basepair can be formed according to pseudoknot free restriction. If one fails, a penalty distance is raised for that base pair | |
883 failLP += 1 | |
884 | |
885 #print dsreg, failLP, float(len(tmp_target_structure_bp)) | |
886 dsLP = float(failLP) | |
887 | |
888 return (dsreg + dsLP) /float(len(tmp_target_structure_bp)) | |
889 | |
890 | |
891 def getGC(sequence): | |
892 """ | |
893 Calculate GC content of a sequence | |
894 """ | |
895 GC = 0 | |
896 for nt in sequence: | |
897 if nt == "G" or nt == "C": | |
898 GC = GC + 1 | |
899 GC = GC/float(len(sequence)) | |
900 return GC | |
901 | |
902 | |
903 def getGCDistance(tGC, gc2, L): | |
904 """ | |
905 Calculate the pseudo GC content distance | |
906 """ | |
907 nt_coeff = L * tGC | |
908 pc_nt = (1/float(L))*100 | |
909 # | |
910 d = gc2 - tGC | |
911 d = d * 100 | |
912 | |
913 f = math.floor(nt_coeff) | |
914 c = math.ceil(nt_coeff) | |
915 | |
916 if d < 0: # | |
917 #print "case x",(abs(nt_coeff - f)), pc_nt, (abs(nt_coeff - f)) * pc_nt, | |
918 d = d + (abs(nt_coeff - f)) * pc_nt | |
919 elif d > 0: # case y | |
920 #print "case y", abs(nt_coeff - c), pc_nt, abs(nt_coeff - c) * pc_nt, | |
921 d = d - abs(nt_coeff - c) * pc_nt | |
922 elif d == 0: | |
923 pass | |
924 | |
925 d = round(d, 7) | |
926 #d = max(0, abs(d)- ( max ( abs( math.ceil(nt_coeff)-(nt_coeff)) , abs(math.floor(nt_coeff)-(nt_coeff)) )/L)*100 ) | |
927 return abs(d) | |
928 | |
929 | |
930 def getSequenceEditDistance(SC, path): | |
931 """ | |
932 Calculate sequence edit distance of a solution to the constraint | |
933 """# | |
934 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"} | |
935 edit = 0 | |
936 for i in xrange(len(SC)): | |
937 if path[i] not in IUPAC[SC[i]]: | |
938 edit += 1 | |
939 return edit/float(len(path)) | |
940 | |
941 | |
942 | |
943 def getTransitions(p): | |
944 """ | |
945 Retreive transitions of a specific path/sequence | |
946 """ | |
947 transitions = [] | |
948 for pos in xrange(len(p)): | |
949 if pos == 0: | |
950 transitions.append(str(pos) + "." + p[pos]) | |
951 | |
952 else: | |
953 insert = p[pos-1] + p[pos] | |
954 transitions.append(str(pos) + "." + insert) | |
955 | |
956 return transitions | |
957 | |
958 | |
959 def evaporate(t, er): | |
960 """ | |
961 Evaporate the terrain's pheromone trails | |
962 """ | |
963 terr, BP = t | |
964 c = 1 | |
965 for key in terr: | |
966 p,l,c = terr[key] | |
967 p *= (1-er) | |
968 terr[key] = (p, l, c) | |
969 | |
970 | |
971 def updateValue(distance, correction_term, omega): | |
972 """ | |
973 Retrieves a distance dependend pheromone value | |
974 """ | |
975 if correction_term == 0: | |
976 return 0 | |
977 else: | |
978 if distance == 0: | |
979 return omega * correction_term | |
980 else: | |
981 return (1/float(distance)) * correction_term | |
982 | |
983 | |
984 def trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose): | |
985 """ | |
986 Pheromone Update function accorinding to the quality of the solution | |
987 """ | |
988 terr, BP = t | |
989 bpstack, LP = getbpStack(c_s) | |
990 | |
991 struct_correction_term , GC_correction_term, seq_correction_term = correction_terms | |
992 omega = 2.23 | |
993 | |
994 bs = updateValue(ds, struct_correction_term, omega) | |
995 bGC = updateValue(dgc, GC_correction_term, omega) | |
996 if dseq != "n.a.": | |
997 bSeq = updateValue(dseq, seq_correction_term, omega) | |
998 d = bs + bGC + bSeq | |
999 else: | |
1000 d = bs + bGC | |
1001 transitions = getTransitions(p) | |
1002 | |
1003 for trans in xrange(len(transitions)): # for each transition in the path | |
1004 id1 = int(transitions[trans].split(".")[0]) | |
1005 tar_id2 = int(BPstack[id1][1][0]) # getting requested position id2 | |
1006 curr_id2 = int(bpstack[id1]) # getting the current situation | |
1007 multiplicator = 0 | |
1008 if tar_id2 == curr_id2 and id1 != tar_id2 and id1 != curr_id2: # case of a base pair, having both brackets on the correct position | |
1009 multiplicator = 1 | |
1010 elif tar_id2 == curr_id2 and id1 == tar_id2 and id1 == curr_id2: # case of a single stranded base in both structures | |
1011 multiplicator = 1 | |
1012 p, l, c = terr[transitions[trans]] # getting the pheromone and the length value of the single path transition | |
1013 p += d * multiplicator | |
1014 terr[transitions[trans]] = (p, l, c) # updating the values wihtin the terrain's | |
1015 t = (terr, BP) | |
1016 | |
1017 | |
1018 def updateTerrain(p, c_s, s, ds, dgc, dseq, dn, t, er, correction_terms, BPstack, verbose, ant_count): | |
1019 """ | |
1020 General updating function | |
1021 """ | |
1022 evaporate(t,er) | |
1023 trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose) | |
1024 | |
1025 | |
1026 def getUsedTime(start_time): | |
1027 """ | |
1028 Return the used time between -start time- and now. | |
1029 """ | |
1030 end_time = time.time() | |
1031 return end_time - start_time | |
1032 | |
1033 | |
1034 def good2Go(SC, L, CC, STR): | |
1035 """ | |
1036 Check, if all input is correct and runnable | |
1037 """ | |
1038 if (SC == 1 and L == 1 and CC == 1 and STR == 1): | |
1039 return True | |
1040 else: | |
1041 print SC,L,CC,STR | |
1042 return False | |
1043 | |
1044 | |
1045 def getPathFromSelection( aps, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy): | |
1046 """ | |
1047 Returns the winning path from a selection of pathes... | |
1048 """ | |
1049 terr, BPs = terrain | |
1050 win_path = 0 | |
1051 for i in xrange(aps): | |
1052 # Generate Sequence | |
1053 path = getPath(s, terr, BPs, alpha, beta, IUPAC, IUPAC_reverseComplements) | |
1054 # Measure sequence features and transform them into singular distances | |
1055 distance_structural = float(getStructuralDistance(s, SC , path, RNAfold, verbose, LP, BPs, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy)) | |
1056 distance_GC = float(getGCDistance(GC,getGC(path), len(path))) | |
1057 distance_seq = float(getSequenceEditDistance(SC, path)) | |
1058 # Calculate Distance Score | |
1059 D = distance_structural + distance_GC + distance_seq | |
1060 | |
1061 # SELECT THE BEST-OUT-OF-k-SOLUTIONS according to distance score | |
1062 if i == 0: | |
1063 win_path = (path, D, distance_structural, distance_GC, distance_seq) | |
1064 else: | |
1065 if D < win_path[1]: | |
1066 win_path = (path, D, distance_structural, distance_GC, distance_seq) | |
1067 return win_path | |
1068 | |
1069 | |
1070 def substr(x, string, subst): | |
1071 """ | |
1072 Classical substring function | |
1073 """ | |
1074 s1 = string[:x-1] | |
1075 | |
1076 s2 = string[x-1:x] | |
1077 s3 = string[x:] | |
1078 #s2 = s[x+len(string)-x-1:] | |
1079 | |
1080 return s1 + subst + s3 | |
1081 | |
1082 | |
1083 def inConvergenceCorridor(d_struct, d_gc, BS_d_struct, BS_d_gc): | |
1084 """ | |
1085 Check if a solutions qualities are within the convergence corridor | |
1086 """ | |
1087 struct_var = ((BS_d_struct/float(4)) + 3 ) * 4 | |
1088 gc_var = (BS_d_gc + 1/float(100) * 5) + BS_d_gc + 1 | |
1089 | |
1090 if d_struct <= struct_var and d_gc <= gc_var: | |
1091 return True | |
1092 else: | |
1093 return False | |
1094 | |
1095 def getGCSamplingValue(GC, tGCmax, tGCvar): | |
1096 """ | |
1097 Returns a suitable GC value, dependend on the user input: Either returning the single GC value, | |
1098 which the user entered, or a smpled GC value | |
1099 from a designated distribution in it's interavals | |
1100 """ | |
1101 returnval = 0 | |
1102 if tGCmax == -1.0 and tGCvar == -1.0: # regular plain tGC value as requested | |
1103 return GC | |
1104 elif tGCmax != -1.0 and tGCvar == -1.0: # uniform distribution tGC value sampling | |
1105 if GC < tGCmax: | |
1106 tmp_GC = tGCmax | |
1107 tGCmax = GC | |
1108 GC = tmp_GC | |
1109 while returnval <= 0: | |
1110 returnval = float(numpy.random.uniform(low=GC, high=tGCmax, size=1)) | |
1111 return returnval | |
1112 elif tGCmax == -1.0 and tGCvar != -1.0: # normal distribution tGC value sampling | |
1113 while returnval <= 0: | |
1114 returnval = float(numpy.random.normal(GC, tGCvar, 1)) | |
1115 return returnval | |
1116 | |
1117 | |
1118 def reachableGC(C_struct): | |
1119 """ | |
1120 Checks if a demanded GC target content is reachable in dependence with the given sequence constraint. | |
1121 """ | |
1122 AU = 0 | |
1123 for i in C_struct: | |
1124 if i == "A" or i == "U": | |
1125 AU += 1 | |
1126 maxGC = 1 - (AU / float(len(C_struct))) # 1 - min_GC | |
1127 return maxGC | |
1128 | |
1129 | |
1130 def runColony(s, SC, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy): | |
1131 """ | |
1132 Execution function of a single ant colony finding one solution sequence | |
1133 """ | |
1134 retString = "" | |
1135 retString2 = "" | |
1136 BPstack, LP = getBPStack(s, SC) | |
1137 | |
1138 rGC = reachableGC(SC) | |
1139 GC_message = "" | |
1140 if GC > rGC: | |
1141 print >> sys.stderr, "WARNING: Chosen target GC %s content is not reachable due to sequence constraint! Sequence Constraint GC-content is: %s" % (GC, rGC) | |
1142 GC = rGC | |
1143 | |
1144 # Initial Constraint Checks prior to execution | |
1145 STR = isValidStructure(s) | |
1146 START_SC , SC = checkSequenceConstraint(str(SC)) | |
1147 START_LENGTH = checkSimilarLength(str(s), str(SC)) | |
1148 START_constraint_compatibility , CompReport = checkConstaintCompatibility(BPstack, SC, IUPAC_compatibles) | |
1149 | |
1150 g2g = good2Go(START_SC, START_LENGTH, START_constraint_compatibility, STR) | |
1151 if (g2g == 1): | |
1152 start_time = time.time() | |
1153 max_time = 600 # seconds | |
1154 | |
1155 | |
1156 | |
1157 | |
1158 #### | |
1159 # INITIALIZATION OF THE RNA TOOLs | |
1160 # | |
1161 RNAfold = init_RNAfold(temperature, paramFile) | |
1162 #RNAdistance = init_RNAdistance() | |
1163 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+') | |
1164 #RNAdist_pattern = re.compile('.*\s([\d]+)') | |
1165 # | |
1166 #### | |
1167 | |
1168 terrain = initTerrain(s) | |
1169 #print len(terrain), | |
1170 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements) | |
1171 #print len(terrain[0]) | |
1172 #printTerrain(terrain) | |
1173 #exit(0) | |
1174 global_ant_count = 0 | |
1175 global_best_ants = 0 | |
1176 criterion = False | |
1177 met = True | |
1178 ant_no = 1 | |
1179 prev_res = 0 | |
1180 seq = "" | |
1181 | |
1182 counter = 0 | |
1183 | |
1184 dstruct_log = [] | |
1185 dGC_log = [] | |
1186 | |
1187 | |
1188 distance_structural = 1000 | |
1189 distance_GC = 1000 | |
1190 distance_seq = 1000 | |
1191 | |
1192 convergence = convergence_count | |
1193 convergence_counter = 0 | |
1194 | |
1195 resets = 0 | |
1196 | |
1197 path = "" | |
1198 curr_structure = "" | |
1199 | |
1200 Dscore = 100000 | |
1201 distance_structural = 10000 | |
1202 distance_GC = 10000 | |
1203 distance_seq = 10000 | |
1204 best_solution = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
1205 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
1206 | |
1207 best_solution_since = 0 | |
1208 | |
1209 ants_per_selection = 10 | |
1210 if len(LP) > 0 : | |
1211 for lp in LP: | |
1212 s = substr(lp + 1, s, ".") | |
1213 s = substr(LP[lp] + 1, s, ".") | |
1214 | |
1215 init = 1 | |
1216 while criterion != met and getUsedTime(start_time) < max_time: | |
1217 iteration_start = time.time() | |
1218 global_ant_count += 1 | |
1219 global_best_ants += 1 | |
1220 | |
1221 path_info = getPathFromSelection(ants_per_selection, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy) | |
1222 | |
1223 distance_structural_prev = distance_structural | |
1224 distance_GC_prev = distance_GC | |
1225 distance_seq_prev = distance_seq | |
1226 | |
1227 path, Dscore , distance_structural, distance_GC, distance_seq = path_info | |
1228 curr_structure = "" | |
1229 if pseudoknots: | |
1230 curr_structure = getPKStructure(path, strategy) | |
1231 else: | |
1232 curr_structure = getRNAfoldStructure(path, RNAfold) | |
1233 | |
1234 curr_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
1235 # BEST SOLUTION PICKING | |
1236 if improve == "h": # hierarchical check | |
1237 # for the global best solution | |
1238 if distance_structural < best_solution[3] or (distance_structural == best_solution[3] and distance_GC < best_solution[4]): | |
1239 best_solution = curr_solution | |
1240 ant_no = 1 | |
1241 # for the local (reset) best solution | |
1242 if distance_structural < best_solution_local[3] or (distance_structural == best_solution_local[3] and distance_GC < best_solution_local[4]): | |
1243 best_solution_local = curr_solution | |
1244 | |
1245 elif improve == "s": #score based check | |
1246 # store best global solution | |
1247 if Dscore < best_solution[2]: | |
1248 best_solution = curr_solution | |
1249 ant_no = 1 | |
1250 # store best local solution for this reset | |
1251 if Dscore < best_solution_local[2]: | |
1252 best_solution_local = curr_solution | |
1253 | |
1254 # OLD ' BEST SOLUTION ' PICKING | |
1255 # if Dscore < best_solution[2]: | |
1256 # best_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
1257 # | |
1258 # if Dscore < best_solution_local[2]: | |
1259 # best_solution_local = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
1260 | |
1261 | |
1262 distance_DN = 0 | |
1263 | |
1264 if verbose: | |
1265 print "SCORE " + str(Dscore) + " Resets " + str(resets) + " #Ant " + str(global_ant_count) + " out of " + str(ants_per_selection) + " cc " + str(convergence_counter) | |
1266 | |
1267 print s, " <- target struct" | |
1268 print best_solution[0] , " <- BS since ", str(best_solution_since), "Size of Terrrain:", len(terrain[0]) | |
1269 print best_solution[1] , " <- BS Dscore " + str(best_solution[2]) + " ds " + str(best_solution[3]) + " dGC " + str(best_solution[4]) + " dseq " + str(best_solution[5])+ " LP " + str(len(LP)) + " <- best solution stats" | |
1270 print curr_structure, " <- CS" | |
1271 print path, | |
1272 print " <- CS", "Dscore", str(Dscore), "ds", distance_structural, "dGC", distance_GC, "GC", getGC(path)*100, "Dseq", distance_seq | |
1273 | |
1274 #### UPDATING THE TERRAIN ACCORDING TO THE QUALITY OF THE CURRENT BESTO-OUT-OF-k SOLUTION | |
1275 updateTerrain(path, curr_structure, s, distance_structural,distance_GC, distance_seq, distance_DN, terrain, evaporation_rate, correction_terms, BPstack, verbose, global_ant_count) | |
1276 #### | |
1277 if verbose: print "Used time for one iteration", time.time() - iteration_start | |
1278 | |
1279 | |
1280 # CONVERGENCE AND TERMINATION CRITERION MANAGEMENT | |
1281 #print distance_structural, distance_GC, best_solution_local[3], best_solution_local[4] | |
1282 if inConvergenceCorridor(curr_solution[3], curr_solution[4], best_solution_local[3], best_solution_local[4]): | |
1283 convergence_counter += 1 | |
1284 if distance_structural_prev == distance_structural and distance_GC_prev == distance_GC: | |
1285 convergence_counter += 1 | |
1286 | |
1287 if best_solution[3] == objective_to_target_distance: | |
1288 if best_solution[4] == 0.0: | |
1289 break | |
1290 ant_no = ant_no + 1 | |
1291 convergence_counter -= 1 | |
1292 else: | |
1293 ant_no = 1 | |
1294 | |
1295 | |
1296 if ant_no == termination_convergence or resets >= reset_limit or global_ant_count >= 100000 or best_solution_since == 5: | |
1297 break | |
1298 | |
1299 # RESET | |
1300 if ant_no < termination_convergence and convergence_counter >= convergence: | |
1301 | |
1302 terrain = initTerrain(s) | |
1303 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements) | |
1304 criterion = False | |
1305 met = True | |
1306 ant_no = 1 | |
1307 prev_res = 0 | |
1308 pre_path = "_" * len(s) | |
1309 path = "" | |
1310 curr_structure = "" | |
1311 counter = 0 | |
1312 Dscore = 100000 | |
1313 distance_structural = 1000 | |
1314 distance_GC = 1000 | |
1315 distance_seq = 1000 | |
1316 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
1317 convergence = convergence_count | |
1318 convergence_counter = 0 | |
1319 | |
1320 if resets == 0: | |
1321 sentinel_solution = best_solution | |
1322 best_solution_since += 1 | |
1323 else: | |
1324 if best_solution[2] < sentinel_solution[2]: | |
1325 sentinel_solution = best_solution | |
1326 best_solution_since = 0 | |
1327 else: | |
1328 best_solution_since += 1 | |
1329 | |
1330 resets += 1 | |
1331 | |
1332 duration = getUsedTime(start_time) | |
1333 | |
1334 retString += "|Ants:" + str(global_ant_count) | |
1335 retString += "|Resets:" + str(resets) + "/" + str(reset_limit) | |
1336 retString += "|AntsTC:" + str(termination_convergence) | |
1337 retString += "|CC:" + str(convergence_count) | |
1338 retString += "|IP:" + str(improve) | |
1339 retString += "|BSS:" + str(best_solution_since) | |
1340 #if GC_message != "": | |
1341 # retString += GC_message + "\n" | |
1342 | |
1343 sequence = best_solution[0] | |
1344 struct = best_solution[1] | |
1345 | |
1346 retString += "|LP:" + str(len(LP)) | |
1347 retString += "|ds:" + str(getStructuralDistance(s,SC, sequence, RNAfold, verbose, LP, BPstack, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy)) | |
1348 retString += "|dGC:" + str(best_solution[4]) | |
1349 retString += "|GC:" + str(getGC(sequence)*100) | |
1350 retString += "|dseq:" + str(getSequenceEditDistance(SC, sequence)) | |
1351 retString += "|L:" + str(len(sequence)) | |
1352 retString += "|Time:" + str(duration) | |
1353 retString2 += "\n" + struct + "\n" | |
1354 retString2 += sequence | |
1355 | |
1356 # CLOSING THE PIPES TO THE PROGRAMS | |
1357 RNAfold.communicate() | |
1358 #RNAdistance.communicate() | |
1359 | |
1360 else: # Structural premisses are not met, htherefore the program will halt with a failure message | |
1361 retString += "\nSome mistake detected\n" | |
1362 retString += "SequenceConstraintCheck: " + str(START_SC) + "\nSequenceConstraint: " + str(SC) + "\nLengthCheck: " + str(START_LENGTH) + "\nConstraintCompatibility: " + str(START_constraint_compatibility)+ "\n" + CompReport + "\n" | |
1363 | |
1364 return (retString, retString2) | |
1365 | |
1366 def findSequence(structure, Cseq, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU, return_mod = False): | |
1367 """ | |
1368 MAIN antaRNA - ant assembled RNA | |
1369 """ | |
1370 | |
1371 if seed != "none": | |
1372 random.seed(seed) | |
1373 | |
1374 if Cseq == "": | |
1375 sequenceconstraint = "N" * len(structure) | |
1376 else: | |
1377 sequenceconstraint = str(Cseq) | |
1378 | |
1379 alpha = float(alpha) | |
1380 beta = float(beta) | |
1381 tGC = float(tGC) | |
1382 evaporation_rate = float(evaporation_rate) | |
1383 struct_correction_term = float(struct_correction_term) | |
1384 GC_correction_term = float(GC_correction_term) | |
1385 seq_correction_term = float(seq_correction_term) | |
1386 colonies = int(colonies) | |
1387 file_id = str(file_id) | |
1388 verbose = verbose | |
1389 output_verbose = output_verbose | |
1390 correction_terms = struct_correction_term, GC_correction_term, seq_correction_term | |
1391 temperature = float(temperature) | |
1392 print_to_STDOUT = (file_id == "STDOUT") | |
1393 | |
1394 useGU = useGU | |
1395 | |
1396 if return_mod == False: | |
1397 if print_to_STDOUT == False: | |
1398 outfolder = '/'.join(file_id.strip().split("/")[:-1]) | |
1399 curr_dir = os.getcwd() | |
1400 if not os.path.exists(outfolder): | |
1401 os.makedirs(outfolder) | |
1402 os.chdir(outfolder) | |
1403 | |
1404 | |
1405 sequenceconstraint = transform(sequenceconstraint) | |
1406 ############################################### | |
1407 | |
1408 # Allowed deviation from the structural target: | |
1409 objective_to_target_distance = 0.0 | |
1410 | |
1411 # Loading the IUPAC copatibilities of nuleotides and their abstract representing symbols | |
1412 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"} | |
1413 IUPAC_compatibles = loadIUPACcompatibilities(IUPAC, useGU) | |
1414 | |
1415 IUPAC_reverseComplements = {} | |
1416 if useGU == False: ## Without the GU basepair | |
1417 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"C", "U":"A", "R":"UC", "Y":"AG", "S":"GC", "W":"UA","K":"CA", "M":"UG", "B":"AGC", "D":"ACU", "H":"UGA", "V":"UGC", "N":"ACGU"} | |
1418 else: ## allowing the GU basepair | |
1419 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"UC", "U":"AG", "R":"UC", "Y":"AG", "S":"UGC", "W":"UAG","K":"UCAG", "M":"UG", "B":"AGCU", "D":"AGCU", "H":"UGA", "V":"UGC", "N":"ACGU"} | |
1420 | |
1421 result = [] | |
1422 for col in xrange(colonies): | |
1423 # Checking the kind of taget GC value should be used | |
1424 GC = getGCSamplingValue(tGC, tGCmax, tGCvar) | |
1425 | |
1426 # Actual execution of a ant colony procesdure | |
1427 output_v, output_w = runColony(structure, sequenceconstraint, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy) | |
1428 | |
1429 # Post-Processing the output of a ant colony procedure | |
1430 line = ">" + name + str(col) | |
1431 if output_verbose: | |
1432 line += "|Cstr:" + structure + "|Cseq:" + sequenceconstraint + "|Alpha:" + str(alpha) + "|Beta:" + str(beta) + "|tGC:" + str(GC) + "|ER:" + str(evaporation_rate) + "|Struct_CT:" + str(struct_correction_term) + "|GC_CT:" + str(GC_correction_term) + "|Seq_CT:" + str(seq_correction_term) + output_v + output_w | |
1433 else: | |
1434 line += output_w | |
1435 if return_mod == False: | |
1436 if print_to_STDOUT: | |
1437 print line | |
1438 else: | |
1439 if col == 0: | |
1440 print2file(file_id, line, 'w') | |
1441 else: | |
1442 print2file(file_id, line, 'a') | |
1443 else: | |
1444 result.append(line) | |
1445 | |
1446 if return_mod == True: | |
1447 return result | |
1448 if print_to_STDOUT == False: | |
1449 os.chdir(curr_dir) | |
1450 | |
1451 def execute(args): | |
1452 """ | |
1453 CHECK AND PARSE THE COMMAND LINE STUFF | |
1454 """ | |
1455 | |
1456 # Checking the arguments, parsed from the shell | |
1457 ############################################### | |
1458 name = args.name | |
1459 structure = args.Cstr | |
1460 | |
1461 if args.Cseq == "": | |
1462 sequenceconstraint = "N" * len(structure) | |
1463 else: | |
1464 sequenceconstraint = args.Cseq | |
1465 | |
1466 seed = args.seed | |
1467 | |
1468 | |
1469 alpha = args.alpha | |
1470 beta = args.beta | |
1471 tGC = args.tGC | |
1472 evaporation_rate = args.ER | |
1473 struct_correction_term = args.Cstrweight | |
1474 GC_correction_term = args.Cgcweight | |
1475 seq_correction_term = args.Cseqweight | |
1476 colonies = args.noOfColonies | |
1477 degreeOfSequenceInducement = args.level | |
1478 file_id = args.output_file | |
1479 verbose = args.verbose | |
1480 output_verbose = args.output_verbose | |
1481 | |
1482 tGCmax = args.tGCmax | |
1483 tGCvar = args.tGCvar | |
1484 | |
1485 termination_convergence = args.antsTerConv | |
1486 convergence_count = args.ConvergenceCount | |
1487 temperature = args.temperature | |
1488 reset_limit = args.Resets | |
1489 | |
1490 improve = args.improve_procedure | |
1491 | |
1492 ### RNAfold parameterfile | |
1493 paramFile = args.paramFile | |
1494 | |
1495 # Using the pkiss program under user changeable parameters | |
1496 pseudoknots = args.pseudoknots | |
1497 | |
1498 # Loading the optimized parameters for pseudoknots and ignore user input | |
1499 if args.pseudoknot_parameters: | |
1500 alpha = 1.0 | |
1501 beta = 0.1 | |
1502 evaporation_rate = 0.2 | |
1503 struct_correction_term = 0.1 | |
1504 GC_correction_term = 1.0 | |
1505 seq_correction_term = 0.5 | |
1506 termination_convergence = 50 | |
1507 convergence_count = 100 | |
1508 | |
1509 | |
1510 strategy = args.strategy | |
1511 useGU = args.useGUBasePair | |
1512 | |
1513 checkForViennaTools() | |
1514 if pseudoknots: | |
1515 checkForpKiss() | |
1516 findSequence(structure, sequenceconstraint, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU) | |
1517 | |
1518 | |
1519 def exe(): | |
1520 """ | |
1521 MAIN EXECUTABLE WHICH PARSES THE INPUT LINE | |
1522 """ | |
1523 | |
1524 argument_parser = argparse.ArgumentParser( | |
1525 description = """ | |
1526 Ant Colony Optimized RNA Sequence Design | |
1527 """, | |
1528 | |
1529 epilog = """ | |
1530 | |
1531 ######################################################################### | |
1532 # antaRNA - ant assembled RNA # | |
1533 # -> Ant Colony Optimized RNA Sequence Design # | |
1534 # ------------------------------------------------------------ # | |
1535 # Robert Kleinkauf (c) 2015 # | |
1536 # Bioinformatics, Albert-Ludwigs University Freiburg, Germany # | |
1537 ######################################################################### | |
1538 | |
1539 - For antaRNA only the VIENNNA RNA Package must be installed on your linux system. | |
1540 antaRNA will only check, if the executables of RNAfold and RNAdistance of the ViennaRNA package can be found. If those programs are | |
1541 not installed correctly, no output will be generated, an also no warning will be prompted. | |
1542 So the binary path of the Vienna Tools must be set up correctly in your system's PATH variable in order to run antaRNA correctly! | |
1543 | |
1544 - antaRNA was only tested under Linux. | |
1545 | |
1546 - For questions and remarks please feel free to contact us at http://www.bioinf.uni-freiburg.de/ | |
1547 | |
1548 Example calls: | |
1549 python antaRNA.py --Cstr "...(((...)))..." --tGC 0.5 -n 2 | |
1550 python antaRNA.py --Cstr ".........AAA(((...)))AAA........." --tGC 0.5 -n 10 --output_file /path/to/antaRNA_TESTRUN -ov | |
1551 python antaRNA.py --Cstr "BBBBB....AAA(((...)))AAA....BBBBB" --Cseq "NNNNANNNNNCNNNNNNNNNNNGNNNNNNUNNN" --tGC 0.5 -n 10 | |
1552 | |
1553 ######################################################################### | |
1554 # --- Hail to the Queen!!! All power to the swarm!!! --- # | |
1555 ######################################################################### | |
1556 """, | |
1557 #formatter_class=RawTextHelpFormatter | |
1558 ) | |
1559 | |
1560 # mandatorys | |
1561 argument_parser.add_argument("-Cstr", "--Cstr", help="Structure constraint using RNA dotbracket notation with fuzzy block constraint. \n(TYPE: %(type)s)\n\n", type=str, required=True) | |
1562 argument_parser.add_argument("-tGC", "--tGC", help="Objective target GC content in [0,1].\n(TYPE: %(type)s)\n\n", type=float, required=True) | |
1563 argument_parser.add_argument("-n", "--noOfColonies", help="Number of sequences which shall be produced. \n(TYPE: %(type)s)\n\n\n\n", type=int, required=True) | |
1564 argument_parser.add_argument("-GU", "--useGUBasePair", help="Allowing GU base pairs. \n\n", action="store_true") | |
1565 | |
1566 argument_parser.add_argument("-s", "--seed", help = "Provides a seed value for the used pseudo random number generator.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="none") | |
1567 argument_parser.add_argument("-ip", "--improve_procedure", help = "Select the improving method. h=hierarchical, s=score_based.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="s") | |
1568 argument_parser.add_argument("-r", "--Resets", help = "Amount of maximal terrain resets, until the best solution is retuned as solution.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=5) | |
1569 argument_parser.add_argument("-CC", "--ConvergenceCount", help = "Delimits the convergence count criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=130) | |
1570 argument_parser.add_argument("-aTC", "--antsTerConv", help = "Delimits the amount of internal ants for termination convergence criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=50) | |
1571 | |
1572 argument_parser.add_argument("-p", "--pseudoknots", help = "Switch to pseudoknot based prediction using pKiss. Check the pseudoknot parameter usage!!!\n\n", action="store_true") | |
1573 argument_parser.add_argument("-pkPar", "--pseudoknot_parameters", help = "Enable optimized parameters for the usage of pseudo knots (Further parameter input ignored).\n\n", action="store_true") | |
1574 argument_parser.add_argument("--strategy", help = "Defining the pKiss folding strategy.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="A") | |
1575 | |
1576 # Mutual Exclusiv target GC distribution variables | |
1577 #tGCgroup = argument_parser.add_mutually_exclusive_group() | |
1578 argument_parser.add_argument("-tGCmax", "--tGCmax", help = "Provides a maximum tGC value [0,1] for the case of uniform distribution sampling. The regular tGC value serves as minimum value.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0) | |
1579 argument_parser.add_argument("-tGCvar", "--tGCvar", help = "Provides a tGC variance (sigma square) for the case of normal distribution sampling. The regular tGC value serves as expectation value (mu).\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0) | |
1580 | |
1581 argument_parser.add_argument("-t", "--temperature", help = "Provides a temperature for the folding algorithms.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=37.0) | |
1582 argument_parser.add_argument("-P", "--paramFile", help = "Changes the energy parameterfile of RNAfold. If using this explicitly, please provide a suitable energy file delivered by RNAfold. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="") | |
1583 argument_parser.add_argument("-of","--output_file", help="Provide a path and an output file, e.g. \"/path/to/the/target_file\". \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="STDOUT") | |
1584 argument_parser.add_argument("-Cseq", "--Cseq", help="Sequence constraint using RNA nucleotide alphabet {A,C,G,U} and wild-card \"N\". \n(TYPE: %(type)s)\n\n", type=str, default = "") | |
1585 argument_parser.add_argument("-l", "--level", help="Sets the level of allowed influence of sequence constraint on the structure constraint [0:no influence; 3:extensive influence].\n(TYPE: %(type)s)\n\n", type=int, default = 1) | |
1586 argument_parser.add_argument("--name", help="Defines a name which is used in the sequence output. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="antaRNA_") | |
1587 argument_parser.add_argument("-a", "--alpha", help="Sets alpha, probability weight for terrain path influence. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0) | |
1588 argument_parser.add_argument("-b", "--beta", help="Sets beta, probability weight for terrain pheromone influence. [0,1] \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0) | |
1589 argument_parser.add_argument("-er", "--ER", help="Pheromone evaporation rate. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.2) | |
1590 argument_parser.add_argument("-Cstrw", "--Cstrweight", help="Structure constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.5) | |
1591 argument_parser.add_argument("-Cgcw", "--Cgcweight", help="GC content constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=5.0) | |
1592 argument_parser.add_argument("-Cseqw", "--Cseqweight", help="Sequence constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n\n", type=float, default=1.0) | |
1593 argument_parser.add_argument("-v", "--verbose", help="Displayes intermediate output.\n\n", action="store_true") | |
1594 argument_parser.add_argument("-ov", "--output_verbose", help="Prints additional output to the headers of the produced sequences.\n\n", action="store_false") | |
1595 | |
1596 args = argument_parser.parse_args() | |
1597 | |
1598 execute(args) | |
1599 | |
1600 def checkForViennaTools(): | |
1601 """ | |
1602 Checking for the presence of the Vienna tools in the system by which'ing for RNAfold and RNAdistance | |
1603 """ | |
1604 RNAfold_output = subprocess.Popen(["which", "RNAfold"], stdout=subprocess.PIPE).communicate()[0].strip() | |
1605 if len(RNAfold_output) > 0 and RNAfold_output.find("found") == -1 and RNAfold_output.find(" no ") == -1: | |
1606 return True | |
1607 else: | |
1608 print "It seems the Vienna RNA Package is not installed on your machine. Please do so!" | |
1609 print "You can get it at http://www.tbi.univie.ac.at/" | |
1610 exit(0) | |
1611 | |
1612 | |
1613 def checkForpKiss(): | |
1614 """ | |
1615 Checking for the presence of pKiss | |
1616 """ | |
1617 pKiss_output = subprocess.Popen(["which", "pKiss"], stdout=subprocess.PIPE).communicate()[0].strip() | |
1618 if len(pKiss_output) > 0 and pKiss_output.find("found") == -1 and pKiss_output.find(" no ") == -1: | |
1619 return True | |
1620 else: | |
1621 print "It seems that pKiss is not installed on your machine. Please do so!" | |
1622 print "You can get it at http://bibiserv2.cebitec.uni-bielefeld.de/pkiss" | |
1623 exit(0) | |
1624 | |
1625 | |
1626 | |
1627 if __name__ == "__main__": | |
1628 | |
1629 exe() | |
1630 |