comparison antaRNA.py @ 0:3abbf92e2ed2 draft

Uploaded
author rnateam
date Thu, 30 Apr 2015 08:27:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3abbf92e2ed2
1 import numpy
2 import sys
3 import random
4 import subprocess
5 import re
6 import decimal
7 import math
8 import os
9 import shutil
10 import time
11 import types
12 import argparse
13 #from argparse import RawTextHelpFormatter
14
15 #############################
16 # FUNCTIONS
17
18 def print2file(f, i, m):
19 """
20 print content i to file f in mode m
21 """
22 line = str(i)
23 if m == "a":
24 call = "echo \"" + line + "\" >> " + f
25 elif m == "w":
26 call = "echo \"" + line + "\" > " + f
27 os.system(call)
28
29 # checking and correcting the alphabet of the constraint sequence
30 def checkSequenceConstraint(SC):
31 """
32 Checks the Sequence constraint for illegal nucleotide characters
33 """
34 out = ""
35 for c in SC:
36 c = c.upper()
37 if c not in "ACGURYSWKMBDHVN":
38 # and c!= "R" and c != "Y" and c != "S" and c != "W" and c != "K" and c != "M" and c != "B" and c != "D" and c != "H" and c != "V":
39 if c == "T":
40 c = "U"
41 else:
42 print "\tIllegal Character in the constraint sequence!"
43 print "\tPlease use the IUPAC nomenclature for defining nucleotides in the constraint sequence!"
44 print "\tA Adenine"
45 print "\tC Cytosine"
46 print "\tG Guanine"
47 print "\tT/U Thymine/Uracil"
48 print "\tR A or G"
49 print "\tY C or T/U"
50 print "\tS G or C"
51 print "\tW A or T/U"
52 print "\tK G or T/U"
53 print "\tM A or C"
54 print "\tB C or G or T/U"
55 print "\tD A or G or T/U"
56 print "\tH A or C or T/U"
57 print "\tV A or C or G"
58 print "\tN any base"
59 exit(0)
60 out += c
61 return (1, out)
62
63
64 def transform(seq):
65 """
66 Transforms "U" to "T" for the processing is done on DNA alphabet
67 """
68 S = ""
69 for s in seq:
70 if s == "T":
71 S += "U"
72 else:
73 S += s
74 return S
75
76
77 def checkSimilarLength(s, SC):
78 """
79 Compares sequence and structure constraint length
80 """
81 if len(s) == len(SC):
82 return 1
83 else:
84 return 0
85
86
87 def isStructure(s):
88 """
89 Checks if the structure constraint only contains "(", ")", and "." and legal fuzzy structure constraint characters.
90 """
91 returnvalue = 1
92 for a in range(0,len(s)):
93 if s[a] not in ".()[]{}<>":
94 if s[a] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
95 returnvalue = 0
96 return returnvalue
97
98
99 def isBalanced(s):
100 """
101 Check if the structure s is of a balanced nature
102 """
103
104 balance = 1
105 for bracket in ["()", "[]", "{}", "<>"]:
106 counter = 0
107 for a in xrange(len(s)):
108 if s[a] in bracket[0]:
109 counter += 1
110 elif s[a] in bracket[1]:
111 counter -= 1
112 if counter != 0:
113 balance = 0
114 return balance
115
116
117
118 def fulfillsHairpinRule(s):
119 """
120 CHECKING FOR THE 3 nt LOOP INTERSPACE
121 for all kind of basepairs, even wihtin the pdeudoknots
122 """
123
124 fulfillsRules = 1
125 for bracket in ["()", "[]", "{}", "<>"]:
126 last_opening_char = 0
127 check = 0
128 for a in xrange(len(s)):
129 if s[a] == bracket[0]:
130 last_opening_char = a
131 check = 1
132 elif s[a] == bracket[1] and check == 1:
133 check = 0
134 if a - last_opening_char < 4:
135 return 0
136 return 1
137
138
139 def isValidStructure(s):
140 """
141 Checks, if the structure s is a valid structure
142 """
143
144 Structure = isStructure(s)
145 Balanced = isBalanced(s)
146 HairpinRule = fulfillsHairpinRule(s)
147
148 if Structure == 1 and Balanced == 1 and HairpinRule == 1:
149 return 1
150 else:
151 print Structure, Balanced, HairpinRule
152 return 0
153
154 def loadIUPACcompatibilities(IUPAC, useGU):
155 """
156 Generating a hash containing all compatibilities of all IUPAC RNA NUCLEOTIDES
157 """
158 compatible = {}
159 for nuc1 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE
160 sn1 = list(IUPAC[nuc1])
161 for nuc2 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE
162 sn2 = list(IUPAC[nuc2])
163 compatib = 0
164 for c1 in sn1: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE:
165 for c2 in sn2: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE:
166 # CHECKING THEIR COMPATIBILITY
167 if useGU == True:
168 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C") or (c1 == "G" and c2 == "U") or (c1 == "U" and c2 == "G"):
169 compatib = 1
170 else:
171 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C"):
172 compatib = 1
173 compatible[nuc1 + "_" + nuc2] = compatib # SAVING THE RESPECTIVE GROUP COMPATIBILITY, REVERSE SAVING IS NOT REQUIRED, SINCE ITERATING OVER ALL AGAINST ALL
174 return compatible
175
176 def isCompatibleToSet(c1, c2, IUPAC_compatibles):
177 """
178 Checks compatibility of c1 wihtin c2
179 """
180 compatible = True
181 for setmember in c2:
182 #print setmember
183 if isCompatible(c1, setmember, IUPAC_compatibles) == False:
184 return False
185 return compatible
186
187
188 def isCompatible(c1, c2, IUPAC_compatibles):
189 """
190 Checks compatibility between character c1 and c2
191 """
192 if IUPAC_compatibles[c1 + "_" + c2] == 1:
193 return True
194 else:
195 return False
196
197
198 def isStructureCompatible(lp1, lp2 ,bp):
199 """
200 Checks, if the region within lp1 and lp2 is structurally balanced
201 """
202 x = lp1 + 1
203 while (x < lp2):
204 if (bp[x] <= lp1 or bp[x] > lp2):
205 return False
206 if x == bp[x]:
207 x += 1
208 else:
209 x = bp[x] + 1
210 return x == lp2
211
212
213 def checkConstaintCompatibility(basepairstack, sequenceconstraint, IUPAC_compatibles):
214 """
215 Checks if the constraints are compatible to each other
216 """
217 returnstring = ""
218 compatible = 1
219 for id1 in basepairstack: # key = (constraint , (pos, constraint)))
220 constr1 = basepairstack[id1][0]
221 id2 = basepairstack[id1][1][0]
222 constr2 = basepairstack[id1][1][1]
223
224 if id1 != id2 and not isCompatible(constr1, constr2, IUPAC_compatibles):
225
226 compatible = 0
227 returnstring += "nucleotide constraint " + str(constr1) + " at position " + str(id1) + " is not compatible with nucleotide constraint " + str(constr2) + " at position " + str(id2) + "\n"
228 #if not isCompatible(basepairstack[basepair][0], basepairstack[basepair][1][1]):
229
230 #compatible = 0
231 #else:
232 #returnstring += "nucleotide constraint " + str(basepairstack[basepair][0]) + " at position " + str(basepair) + " is compatible with nucleotide constraint " + str(basepairstack[basepair][1][1]) + " at position " + str(basepairstack[basepair][1][0]) + "\n"
233 return (compatible, returnstring)
234
235
236 def getLP(BPSTACK):
237 """
238 Retreives valid lonley base pairs from a base pair stack
239 """
240 #20 ('N', (>BLOCK<, 'N'))
241
242 # geting single base pairs
243 stack = {}
244 LP = {}
245 if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType:
246 for i in BPSTACK.keys():
247 #if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
248 stack[i] = int(BPSTACK[i][1][0])
249 #print i , BPSTACK[i][1][0]
250 else:
251 for i in BPSTACK.keys():
252 #if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
253 stack[i] = BPSTACK[i]
254
255 # removing redundant base pair indices
256 for i in stack.keys():
257 if i >= stack[i]:
258 del stack[i]
259
260 # actual checking for single lonley base pairs
261 for i in stack.keys():
262 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and not (i+1 in stack and stack[i+1] == stack[i] - 1):
263 LP[i] = stack[i]
264
265 ##actual removal of 2er lonley base pairs
266 for i in stack.keys():
267 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and (i+1 in stack and stack[i+1] == stack[i] - 1) and not (i+2 in stack and stack[i+2] == stack[i] - 2):
268 LP[i] = stack[i]
269 LP[i+1] = stack[i+1]
270
271
272 #if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType:
273 #for i in BPSTACK.keys():
274
275 ##if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
276 #stack[i] = int(BPSTACK[i][1][0])
277 ##print i , BPSTACK[i][1][0]
278 #else:
279 #for i in BPSTACK.keys():
280 ##if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
281 #stack[i] = BPSTACK[i]
282
283 #for i in stack.keys():
284 #if i >= stack[i]:
285 #del stack[i]
286
287
288
289 return LP
290
291
292 def getBPStack(s, seq):
293 """
294 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq.
295 """
296 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]}
297 bpstack = {}
298 for i in xrange(len(s)):
299
300 # REGULAR SECONDARY STRUCTURE DETECTION
301 if s[i] in "(){}[]<>":
302
303 no = 0
304 ### opening
305 if s[i] in "([{<":
306 if s[i] == "(":
307 tmp_stack["()"].append((i, seq[i]))
308 elif s[i] == "[":
309 tmp_stack["[]"].append((i, seq[i]))
310 elif s[i] == "{":
311 tmp_stack["{}"].append((i, seq[i]))
312 elif s[i] == "<":
313 tmp_stack["<>"].append((i, seq[i]))
314
315 #closing
316 elif s[i] in ")]}>":
317 if s[i] == ")":
318 no, constr = tmp_stack["()"].pop()
319 elif s[i] == "]":
320 no, constr = tmp_stack["[]"].pop()
321 elif s[i] == "}":
322 no, constr = tmp_stack["{}"].pop()
323 elif s[i] == ">":
324 no, constr = tmp_stack["<>"].pop()
325 bpstack[no] = (constr, (i, seq[i]))
326 bpstack[i] = (seq[i] ,(no, constr))
327
328 elif s[i] == ".":
329 bpstack[i] = (seq[i], (i, seq[i]))
330 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
331 bpstack[i] = (seq[i], (i, seq[i]))
332
333 return (bpstack, getLP(bpstack))
334
335
336 def getbpStack(s):
337 """
338 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq.
339 """
340 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]}
341 bpstack = {}
342
343 for i in xrange(len(s)):
344 if s[i] in "(){}[]<>":
345
346 no = 0
347 ### opening
348 if s[i] in "([{<":
349 if s[i] == "(":
350 tmp_stack["()"].append(i)
351 elif s[i] == "[":
352 tmp_stack["[]"].append(i)
353 elif s[i] == "{":
354 tmp_stack["{}"].append(i)
355 elif s[i] == "<":
356 tmp_stack["<>"].append(i)
357
358 #closing
359 elif s[i] in ")]}>":
360 if s[i] == ")":
361 no = tmp_stack["()"].pop()
362 elif s[i] == "]":
363 no = tmp_stack["[]"].pop()
364 elif s[i] == "}":
365 no = tmp_stack["{}"].pop()
366 elif s[i] == ">":
367 no = tmp_stack["<>"].pop()
368 bpstack[no] = i # save basepair in the format {opening base id (opening seq constr,(closing base id, closing seq constr))}
369 bpstack[i] = no # save basepair in the format {closing base id (closing seq constr,(opening base id, opening seq constr))}
370
371 elif s[i] == ".": # no structural constaint given: produce entry, which references itself as a base pair partner....
372 bpstack[i] = i
373
374 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
375 bpstack[i] = i
376
377 #elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
378 ## per position, assigned to a certain block, the target nucleotide, with whcih it should interact is marked with the specific
379 ## block character
380 #bpstack[i] = s[i]
381
382 return (bpstack, getLP(bpstack))
383
384 def maprange( a, b, s):
385 """
386 Mapping function
387 """
388 (a1, a2), (b1, b2) = a, b
389 return b1 + ((s - a1) * (b2 - b1) / (a2 - a1))
390
391
392 def applyGCcontributionPathAdjustment(pathlength, tmpGC, nt):
393 """
394 GC path length contribution calculation.
395 """
396 GCadjustment = 1.5
397 minimum = 0.5
398 upper = GCadjustment
399 lower = minimum
400
401 if nt == "A" or nt == "U":
402 pathlength = pathlength * maprange( (0, 1) , (lower, upper), tmpGC)
403
404 if nt == "G" or nt == "C":
405 #pathlength = pathlength * (float(1-tmpGC))
406 pathlength = pathlength * maprange( (1, 0) , (lower, upper), tmpGC)
407 return pathlength
408
409 def getConstraint(TE, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements):
410 """
411 Dependend on the situation in the constraint an the respective path section, setting wether a specific constraint can be given or not (for that path section)
412 """
413 # TE :: transition element / path section under dispute
414 # id1 :: id of the position of the caharacter to which the transition is leading to
415 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present
416 # val :: BPstack information of the specific position
417 # constr1 :: constraining character of pos id1
418 # constr2 :: constraining character of pos id2
419
420 id1 = int(TE.split(".")[0])
421 val = BPstack[id1] # check out the value of the destination character in the basepair/constraint stack
422 constr1 = val[0] # getting the constraint character of position id1
423 id2 = int(val[1][0]) # getting position id2
424 constr2 = val[1][1] # getting the sequence constraint for position id2
425 targetNucleotide = TE.split(".")[1][-1:] # where the edge is leading to
426
427 c1 = set(IUPAC[constr1]) # getting all explicit symbols of c1
428 c2 = set(IUPAC_reverseComplements[constr2]) # getting the reverse complement explicit symbols of c2
429
430 if targetNucleotide in c1:
431 if id1 == id2:
432 return 1
433 else:
434 if targetNucleotide in c2:
435 return 1
436 else:
437 return 0
438 else:
439 return 0
440
441 """
442 def getConstraint(TE, BPstack):
443 # TE :: transition element / path section
444 # id1 :: id of the position of the caharacter to which the transition is leading to
445 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present
446 # val :: BPstack information of the specific position
447 # constr1 :: constraining character of pos id1
448 # constr2 :: constraining character of pos id2
449
450 ### BPstack [id1] = (constr1, (id2, constr2))
451
452 id1 = TE.split(".")[0]
453 #print id1
454 #id1 = TE.find(TE.strip("_")) # strip the path section and getting the position of the section
455 #if len(TE.strip("_")) == 2: # check if the path section is from an internal and not an initial transition
456 #id1 += 1 # increase position id1 by 1, since the last character of the section is the destination character
457 val = BPstack[int(id1)] # check out the value of the destination character in the basepair/constraint stack
458 constr1 = val[0] # getting the constraint character of position id1
459 id2 = val[1][0] # getting position id2
460 constr2 = val[1][1] # getting the sequence constraint for position id2
461 #print TE, id1, constr1, id2, constr2,
462
463 #TE.split(".")[1][-1:]
464 if id1 == id2: # both ids were the same with either character, sequential or no sequential constraint -> no basepair constraint
465 if constr1 == TE.split(".")[1][-1:] and constr2 == TE.split(".")[1][-1:]: # case if the single base constraints on position id1 == id2 are the same as the destination character on id1
466 #print 1
467 return 1
468 elif constr1 == constr2 == "N": # case if the single base constraints on position id1 == id2 has no constraint
469 #print 1
470 return 1
471 else: # single base sequence constraints differ
472 #print 0
473 return 0
474
475 elif id1 != id2: # showing differentq ids, indicating a bp, (basepair structural constraint)
476 if constr1 == "N" and constr2 == "N": # no sequence constraint
477 #print 1
478 return 1
479 if constr1 == "N" and constr2 != "N": # c1 has no constraint, c2 has character constraint (sequence constraint of closing bases)
480 if TE.split(".")[1][-1:] == complementBase(constr2): # the current path section destination base is equal to the complement base of the mentioned sequence constraint in constr2
481 #print 1
482 return 1
483 else: # case if the current path section destination base is not equeal to the mentioned complement sequence constraint in constr2
484 #print 0
485 return 0
486 if constr1 != "N" and constr2 == "N": # c1 has character constraint, c2 has no character constraint (sequence constraint in the opening base)
487 if TE.split(".")[1][-1:] == constr1: # the current path section destination base is as constrained with constr1
488 #print 1
489 return 1
490 else: # the current path section destination base is not as constrained in constr1
491 #print 0
492 return 0
493 if constr1 != "N" and constr2 != "N": # both positions have sequential constraint
494 if TE.split(".")[1][-1:] == constr1:
495 #print 1
496 return 1
497 else:
498 #print 0
499 return 0
500 """
501
502 def applyTerrainModification(terrain, s, tmpGC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements):
503 #nucleotides = {'A': 0, 'C': 1,'G': 2,'T': 3}
504
505 dels = []
506 for terrainelement in sorted(terrain):
507 pheromone, pathlength = terrain[terrainelement]
508 pheromone = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
509 pathlength = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
510 pathlength = applyGCcontributionPathAdjustment(pathlength, tmpGC,terrainelement.split(".")[1][-1:])
511 if pheromone * pathlength == 0: dels.append(terrainelement)
512 terrain[terrainelement] = (pheromone, pathlength,[])
513 further_dels = {}
514 for terrainelement in sorted(dels):
515 pos, nucs = terrainelement.split(".")
516 if int(pos) < len(s)-1:
517 to_nt = nucs[-1:]
518 successor_pos = int(pos) + 1
519 for i in ["A", "C", "G", "U"]:
520 del_element = str(successor_pos) + "." + to_nt + i
521 further_dels[del_element] = 1
522 further_dels[terrainelement] = 1
523 # deleting the inbound and outbound edges, which are forbidden
524 for terrainelement in further_dels:
525 del terrain[terrainelement]
526 # allocate the appropriate children of edges
527 for terrainelement in terrain:
528 pheromone, pathlength, children = terrain[terrainelement]
529 pos, nucs = terrainelement.split(".")
530 if int(pos) < len(s):
531 to_nt = nucs[-1:]
532 successor_pos = int(pos) + 1
533 for i in ["A", "C", "G", "U"]:
534 if str(successor_pos) + "." + to_nt + i in terrain:
535 children.append(str(successor_pos) + "." + to_nt + i)
536 terrain[terrainelement] = (pheromone, pathlength,children)
537 starts = []
538 for i in ["A", "C", "G", "U"]:
539 if str(0) + "." + i in terrain:
540 starts.append(str(0) + "." + i)
541 terrain["00.XY"] = (1, 1, starts)
542 return (terrain, BPstack)
543
544
545 def initTerrain(s): # THE CLASSIC
546 """
547 Initialization of the terrain with graph like terrain... vertices are modeled implicitly
548 """
549 nt = ["A","C","G","U"]
550 nt2 = ["AA","AC","AG","AU","CA","CC","CG","CU","GA","GC","GG","GU","UA","UC","UG","UU"] # Allowed dinucleotides
551 e = {}
552 pathlength = 1
553 pheromone = 1
554 for p in xrange(len(s)):
555 if p == 0:
556 for i in nt:
557 e["%s.%s"%(p,i)] = (pheromone, pathlength)
558 elif p > 0:
559 for n in nt2:
560 e["%s.%s"%(p,n)] = (pheromone, pathlength)
561 return e
562
563
564
565 def complementBase(c):
566 """
567 Returns the complement RNA character of c (without GU base pairs)
568 """
569 retChar = ""
570 if c == "A" :
571 retChar = "U"
572 elif c == "U":
573 retChar = "A"
574 elif c == "C":
575 retChar = "G"
576 elif c == "G":
577 retChar = "C"
578 return retChar
579
580 def printTerrain(terrain):
581 #print sorted(terrain.keys())
582 tmp_i = "0"
583 tmp_c = 0
584 terrain = terrain[0]
585
586 for a, i in enumerate(sorted(terrain.keys())):
587 #print a
588 if i.split(".")[0] != tmp_i:
589 print "\nElements:", tmp_c,"\n#########################\n", i, terrain[i]
590
591 tmp_c = 1
592 tmp_i = i.split(".")[0]
593 else:
594 print i, terrain[i]
595 tmp_c += 1
596
597 print "\nElements:", tmp_c
598 print "#########################"
599 print len(terrain)
600
601 def pickStep(tmp_steps, summe):
602 """
603 Selects a step within the terrain
604 """
605 if len(tmp_steps) == 1:
606 return tmp_steps[0][1] # returning the nucleotide of the only present step
607 else:
608 rand = random.random() # draw random number
609 mainval = 0
610 for choice in xrange(len(tmp_steps)):
611 val, label = tmp_steps[choice]
612 mainval += val/float(summe)
613 if mainval > rand: # as soon, as the mainval gets larger than the random value the assignment is done
614 return label
615
616 def getPath(s, tmp_terrain, tmp_BPstack, alpha, beta, IUPAC, IUPAC_reverseComplements):
617 """
618 Performs a walk through the terrain and assembles a sequence, while respecting the structure constraint and IUPAC base complementarity
619 of the base pairs GU, GC and AT
620 """
621 nt = ["A","C","G","U"]
622 prev_edge = "00.XY"
623 sequence = ""
624 while len(sequence) < len(s):
625 coming_from = sequence[-1:]
626 summe = 0
627 steps = []
628 i = len(sequence)
629 allowed_nt = "ACGU"
630 # base pair closing case check, with subsequent delivery of a reduced allowed nt set
631
632 if i > tmp_BPstack[i][1][0]:
633 jump = tmp_BPstack[i][1][0]
634 nuc_at_jump = sequence[jump]
635 allowed_nt = IUPAC_reverseComplements[nuc_at_jump]
636
637 #allowed_nt = complementBase(nuc_at_jump)
638
639 # Checking for every possible nt if it is suitable for the selection procedure
640 for edge in tmp_terrain[prev_edge][-1]:
641
642 if edge[-1:] in allowed_nt:
643 pheromone, PL , children = tmp_terrain[edge]
644 #if PL > 0:
645 value = ((float(pheromone * alpha)) + ((1/float(PL)) * beta))
646 summe += value
647 steps.append((value, edge))
648 prev_edge = pickStep(steps, summe)
649 sequence += prev_edge[-1:]
650
651 return sequence
652
653
654 ###
655 # STRUCTURE PREDICTORS
656 ###
657 def getPKStructure(sequence, temperature, mode = "A"):
658 """
659 Initialization pKiss mfe pseudoknot prediction
660 """
661 p2p = "pKiss"
662 #p2p = "/usr/local/pkiss/2014-03-17/bin/pKiss_mfe"
663 strategy = "--strategy "
664 t = "--temperature " + str(temperature)
665
666 if mode == "A": strategy += "A"
667 elif mode == "B": strategy += "B"
668 elif mode == "C": strategy += "C"
669 elif mode == "D": strategy += "D"
670 elif mode == "P": strategy += "P"
671
672 p = subprocess.Popen( ([p2p, "--mode mfe", strategy, t]),
673 #shell = True,
674 stdin = subprocess.PIPE,
675 stdout = subprocess.PIPE,
676 stderr = subprocess.PIPE,
677 close_fds = True)
678 #print p.stderr.readline()
679
680 p.stdin.write(sequence+'\n')
681 pks = p.communicate()
682 structure = "".join(pks[0].split("\n")[2].split(" ")[-1:])
683 return structure
684
685 def init_RNAfold(temperature, paramFile = ""):
686 """
687 Initialization RNAfold listener
688 """
689 #p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAfold"
690 p2p = "RNAfold"
691
692 t = "-T " + str(temperature)
693 P = ""
694 if paramFile != "":
695 P = "-P " + paramFile
696 p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]),
697 #shell = True,
698 stdin = subprocess.PIPE,
699 stdout = subprocess.PIPE,
700 stderr = subprocess.PIPE,
701 close_fds = True)
702 return p
703
704
705 def consult_RNAfold(seq, p):
706 """
707 Consults RNAfold listener
708 """
709 p.stdin.write(seq+'\n')
710 out = ""
711 for i in xrange(2):
712 out += p.stdout.readline()
713 return out
714
715
716 def getRNAfoldStructure(struct2, process1):
717 """
718 Retrieves folded structure of a RNAfold call
719 """
720
721 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+')
722 #RNAdist_pattern = re.compile('.*\s([\d]+)')
723 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(struct2, process1))
724 current_structure = ""
725 #if RNAfold_match:
726 return RNAfold_match.group(1)
727
728
729 def init_RNAdistance():
730 """
731 Initialization of RNAdistance listener
732 """
733 #p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAdistance"
734 p2p = "RNAdistance"
735 p = subprocess.Popen( ([p2p]),
736 #shell = True,
737 stdin = subprocess.PIPE,
738 stdout = subprocess.PIPE,
739 stderr = subprocess.PIPE,
740 close_fds = True)
741 return p
742
743
744 def consult_RNAdistance(s1, s2, p):
745 """
746 Consulting the RNAdistance listener
747 """
748 p.stdin.write(s1+'\n')
749 p.stdin.write(s2+'\n')
750 out = ""
751 out_tmp = p.stdout.readline().strip()
752 if out_tmp != "":
753 out += out_tmp
754 return out
755
756 def getInducingSequencePositions(Cseq, degreeOfSequenceInducement):
757 """
758 Delimiting the degree of structure inducement by the supplied sequence constraint.
759 0 : no sequence induced structure constraint
760 1 : "ACGT" induce structure (explicit nucleotide structure inducement level)
761 2 : "MWKSYR" and "ACGT" (explicit and double instances)
762 3 : "BDHV" , "MWKSYR" and "ACGT" (explicit, double, and triple instances)
763 """
764 setOfNucleotides = "" # resembling the "0"-case
765 if degreeOfSequenceInducement == 1:
766 setOfNucleotides = "ACGU"
767 elif degreeOfSequenceInducement == 2:
768 setOfNucleotides = "ACGUMWKSYR"
769 elif degreeOfSequenceInducement == 3:
770 setOfNucleotides = "ACGUMWKSYRBDHV"
771 #elif degreeOfSequenceInducement == 4:
772 #setOfNucleotides = "ACGTMWKSYRBDHVN"
773
774 tmpSeq = ""
775 listset = setOfNucleotides
776 for pos in Cseq:
777 if pos not in listset:
778 tmpSeq += "N"
779 else:
780 tmpSeq += pos
781
782 return setOfNucleotides, tmpSeq
783
784
785 def getBPDifferenceDistance(stack1, stack2):
786 """
787 Based on the not identical amount of base pairs within both structure stacks
788 """
789 d = 0
790 for i in stack1.keys():
791 # check base pairs in stack 1
792 if i < stack1[i] and stack1[i] != stack2[i]:
793 d += 1
794 # check base pairs in stack 2
795 for i in stack2.keys():
796 if i < stack2[i] and stack1[i] != stack2[i]:
797 d += 1
798 return d
799
800
801 def getStructuralDistance(target_structure, Cseq, path, RNAfold, verbose, LP, BP, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy):
802 """
803 Calculator for Structural Distance
804 """
805 # fold the current solution's sequence to obtain the structure
806
807 current_structure = ""
808
809 if pseudoknots:
810 current_structure = getPKStructure(path,strategy)
811 else:
812 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(path, RNAfold))
813 current_structure = RNAfold_match.group(1)
814
815 # generate the current structure's base-pair stack
816 bp = getbpStack(current_structure)[0]
817 # add case-dependend structural constraints in case of lonley basepairs formation
818 tmp_target_structure_bp = getbpStack(target_structure)[0]
819
820 for lp in LP:
821 if bp[lp] == LP[lp]: # if the base pair is within the current solution structure, re-add the basepair into the constraint structure.
822 #tmp_target_structure[lp] = "("
823 #tmp_target_structure[LP[lp]] = ")"
824 tmp_target_structure_bp[lp] = LP[lp]
825 tmp_target_structure_bp[LP[lp]] = lp
826
827 # REMOVE BLOCK CONSTRAINT AND SUBSTITUTE IT WITH SINGLE STRAND INFORMATION repsective with brackets, if allowed base pairs occure
828 # check for all allowed implicit constraint block declarators
829 for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
830 occurances = []
831 for m in re.finditer(c, target_structure): # search for a declarator in the requested structure
832 occurances.append(m.start()) # save the corresponding index
833
834 # transform declarator into single stranded request
835 for i in occurances:
836 #tmp_target_structure[i] = "."
837 tmp_target_structure_bp[i] = i
838 # infer a base pair within the block declarated positions, if the current structure provides it.
839 for i in occurances:
840 for j in occurances:
841 if i < j:
842 if bp[i] == j:
843 #tmp_target_structure[i] = "("
844 #tmp_target_structure[bp[i]] = ")"
845
846 tmp_target_structure_bp[i] = bp[i]
847 tmp_target_structure_bp[bp[i]] = i
848
849 # CHECK FOR SEQUENCE CONSTRAINT WHICH INDUCES STRUCTURE CONSTRAINT IN THE MOMENTARY SITUATION
850 #print "Checking Cseq influence and it's induced basepairs..."
851 IUPACinducers, tmp_Cseq = getInducingSequencePositions(Cseq, degreeOfSequenceInducement)
852 if len(Cseq.strip("N")) > 0:
853 #print "Processing Cseq influence"
854 # Iterate over all positions within the Base Pair stack
855 for i in BP: # Check for each base index i
856
857 if i < bp[i]: # if the current index is samller that the affiliated in the basepair stack of the current solution
858
859 bp_j = bp[i] # Actual j index of the current solution
860 BP_j = BP[i][1][0] # j index of the requested structure
861 if (i != bp_j and i == BP_j and BP[i][0] in IUPACinducers ): # if i pairs with some other base in the current structure, and i is requested single stranded and the Sequence constraint is allowed to induce...
862 if (BP[bp_j][1][0] == bp_j and BP[bp_j][0] in IUPACinducers):# If position j is requested singlestranded and position j nucleotide can induce base pairs
863 #if isCompatible(bp[i][0], bp[i][1][1], IUPAC_compatibles): # If both nucleotides, i and j are actually compatible
864 #tmp_target_structure[i] = "("
865 #tmp_target_structure[bp_j] = ")"
866
867 tmp_target_structure_bp[i] = bp[i]
868 tmp_target_structure_bp[bp_j] = i
869
870 #tts = "".join(tmp_target_structure)
871 dsreg = getBPDifferenceDistance(tmp_target_structure_bp, bp)
872
873 # CHECK FOR ALL DETERMINED LONELY BASE PAIRS (i<j), if they are formed
874 failLP = 0
875 for lp in LP:
876
877 if bp[lp] != LP[lp]:
878
879 isComp = isCompatible(path[lp],path[LP[lp]], IUPAC_compatibles)
880 isStru = isStructureCompatible(lp, LP[lp] ,bp)
881 if not ( isStru and isStru ): # check if the bases at the specific positions are compatible and check if the
882 # basepair can be formed according to pseudoknot free restriction. If one fails, a penalty distance is raised for that base pair
883 failLP += 1
884
885 #print dsreg, failLP, float(len(tmp_target_structure_bp))
886 dsLP = float(failLP)
887
888 return (dsreg + dsLP) /float(len(tmp_target_structure_bp))
889
890
891 def getGC(sequence):
892 """
893 Calculate GC content of a sequence
894 """
895 GC = 0
896 for nt in sequence:
897 if nt == "G" or nt == "C":
898 GC = GC + 1
899 GC = GC/float(len(sequence))
900 return GC
901
902
903 def getGCDistance(tGC, gc2, L):
904 """
905 Calculate the pseudo GC content distance
906 """
907 nt_coeff = L * tGC
908 pc_nt = (1/float(L))*100
909 #
910 d = gc2 - tGC
911 d = d * 100
912
913 f = math.floor(nt_coeff)
914 c = math.ceil(nt_coeff)
915
916 if d < 0: #
917 #print "case x",(abs(nt_coeff - f)), pc_nt, (abs(nt_coeff - f)) * pc_nt,
918 d = d + (abs(nt_coeff - f)) * pc_nt
919 elif d > 0: # case y
920 #print "case y", abs(nt_coeff - c), pc_nt, abs(nt_coeff - c) * pc_nt,
921 d = d - abs(nt_coeff - c) * pc_nt
922 elif d == 0:
923 pass
924
925 d = round(d, 7)
926 #d = max(0, abs(d)- ( max ( abs( math.ceil(nt_coeff)-(nt_coeff)) , abs(math.floor(nt_coeff)-(nt_coeff)) )/L)*100 )
927 return abs(d)
928
929
930 def getSequenceEditDistance(SC, path):
931 """
932 Calculate sequence edit distance of a solution to the constraint
933 """#
934 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"}
935 edit = 0
936 for i in xrange(len(SC)):
937 if path[i] not in IUPAC[SC[i]]:
938 edit += 1
939 return edit/float(len(path))
940
941
942
943 def getTransitions(p):
944 """
945 Retreive transitions of a specific path/sequence
946 """
947 transitions = []
948 for pos in xrange(len(p)):
949 if pos == 0:
950 transitions.append(str(pos) + "." + p[pos])
951
952 else:
953 insert = p[pos-1] + p[pos]
954 transitions.append(str(pos) + "." + insert)
955
956 return transitions
957
958
959 def evaporate(t, er):
960 """
961 Evaporate the terrain's pheromone trails
962 """
963 terr, BP = t
964 c = 1
965 for key in terr:
966 p,l,c = terr[key]
967 p *= (1-er)
968 terr[key] = (p, l, c)
969
970
971 def updateValue(distance, correction_term, omega):
972 """
973 Retrieves a distance dependend pheromone value
974 """
975 if correction_term == 0:
976 return 0
977 else:
978 if distance == 0:
979 return omega * correction_term
980 else:
981 return (1/float(distance)) * correction_term
982
983
984 def trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose):
985 """
986 Pheromone Update function accorinding to the quality of the solution
987 """
988 terr, BP = t
989 bpstack, LP = getbpStack(c_s)
990
991 struct_correction_term , GC_correction_term, seq_correction_term = correction_terms
992 omega = 2.23
993
994 bs = updateValue(ds, struct_correction_term, omega)
995 bGC = updateValue(dgc, GC_correction_term, omega)
996 if dseq != "n.a.":
997 bSeq = updateValue(dseq, seq_correction_term, omega)
998 d = bs + bGC + bSeq
999 else:
1000 d = bs + bGC
1001 transitions = getTransitions(p)
1002
1003 for trans in xrange(len(transitions)): # for each transition in the path
1004 id1 = int(transitions[trans].split(".")[0])
1005 tar_id2 = int(BPstack[id1][1][0]) # getting requested position id2
1006 curr_id2 = int(bpstack[id1]) # getting the current situation
1007 multiplicator = 0
1008 if tar_id2 == curr_id2 and id1 != tar_id2 and id1 != curr_id2: # case of a base pair, having both brackets on the correct position
1009 multiplicator = 1
1010 elif tar_id2 == curr_id2 and id1 == tar_id2 and id1 == curr_id2: # case of a single stranded base in both structures
1011 multiplicator = 1
1012 p, l, c = terr[transitions[trans]] # getting the pheromone and the length value of the single path transition
1013 p += d * multiplicator
1014 terr[transitions[trans]] = (p, l, c) # updating the values wihtin the terrain's
1015 t = (terr, BP)
1016
1017
1018 def updateTerrain(p, c_s, s, ds, dgc, dseq, dn, t, er, correction_terms, BPstack, verbose, ant_count):
1019 """
1020 General updating function
1021 """
1022 evaporate(t,er)
1023 trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose)
1024
1025
1026 def getUsedTime(start_time):
1027 """
1028 Return the used time between -start time- and now.
1029 """
1030 end_time = time.time()
1031 return end_time - start_time
1032
1033
1034 def good2Go(SC, L, CC, STR):
1035 """
1036 Check, if all input is correct and runnable
1037 """
1038 if (SC == 1 and L == 1 and CC == 1 and STR == 1):
1039 return True
1040 else:
1041 print SC,L,CC,STR
1042 return False
1043
1044
1045 def getPathFromSelection( aps, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy):
1046 """
1047 Returns the winning path from a selection of pathes...
1048 """
1049 terr, BPs = terrain
1050 win_path = 0
1051 for i in xrange(aps):
1052 # Generate Sequence
1053 path = getPath(s, terr, BPs, alpha, beta, IUPAC, IUPAC_reverseComplements)
1054 # Measure sequence features and transform them into singular distances
1055 distance_structural = float(getStructuralDistance(s, SC , path, RNAfold, verbose, LP, BPs, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy))
1056 distance_GC = float(getGCDistance(GC,getGC(path), len(path)))
1057 distance_seq = float(getSequenceEditDistance(SC, path))
1058 # Calculate Distance Score
1059 D = distance_structural + distance_GC + distance_seq
1060
1061 # SELECT THE BEST-OUT-OF-k-SOLUTIONS according to distance score
1062 if i == 0:
1063 win_path = (path, D, distance_structural, distance_GC, distance_seq)
1064 else:
1065 if D < win_path[1]:
1066 win_path = (path, D, distance_structural, distance_GC, distance_seq)
1067 return win_path
1068
1069
1070 def substr(x, string, subst):
1071 """
1072 Classical substring function
1073 """
1074 s1 = string[:x-1]
1075
1076 s2 = string[x-1:x]
1077 s3 = string[x:]
1078 #s2 = s[x+len(string)-x-1:]
1079
1080 return s1 + subst + s3
1081
1082
1083 def inConvergenceCorridor(d_struct, d_gc, BS_d_struct, BS_d_gc):
1084 """
1085 Check if a solutions qualities are within the convergence corridor
1086 """
1087 struct_var = ((BS_d_struct/float(4)) + 3 ) * 4
1088 gc_var = (BS_d_gc + 1/float(100) * 5) + BS_d_gc + 1
1089
1090 if d_struct <= struct_var and d_gc <= gc_var:
1091 return True
1092 else:
1093 return False
1094
1095 def getGCSamplingValue(GC, tGCmax, tGCvar):
1096 """
1097 Returns a suitable GC value, dependend on the user input: Either returning the single GC value,
1098 which the user entered, or a smpled GC value
1099 from a designated distribution in it's interavals
1100 """
1101 returnval = 0
1102 if tGCmax == -1.0 and tGCvar == -1.0: # regular plain tGC value as requested
1103 return GC
1104 elif tGCmax != -1.0 and tGCvar == -1.0: # uniform distribution tGC value sampling
1105 if GC < tGCmax:
1106 tmp_GC = tGCmax
1107 tGCmax = GC
1108 GC = tmp_GC
1109 while returnval <= 0:
1110 returnval = float(numpy.random.uniform(low=GC, high=tGCmax, size=1))
1111 return returnval
1112 elif tGCmax == -1.0 and tGCvar != -1.0: # normal distribution tGC value sampling
1113 while returnval <= 0:
1114 returnval = float(numpy.random.normal(GC, tGCvar, 1))
1115 return returnval
1116
1117
1118 def reachableGC(C_struct):
1119 """
1120 Checks if a demanded GC target content is reachable in dependence with the given sequence constraint.
1121 """
1122 AU = 0
1123 for i in C_struct:
1124 if i == "A" or i == "U":
1125 AU += 1
1126 maxGC = 1 - (AU / float(len(C_struct))) # 1 - min_GC
1127 return maxGC
1128
1129
1130 def runColony(s, SC, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy):
1131 """
1132 Execution function of a single ant colony finding one solution sequence
1133 """
1134 retString = ""
1135 retString2 = ""
1136 BPstack, LP = getBPStack(s, SC)
1137
1138 rGC = reachableGC(SC)
1139 GC_message = ""
1140 if GC > rGC:
1141 print >> sys.stderr, "WARNING: Chosen target GC %s content is not reachable due to sequence constraint! Sequence Constraint GC-content is: %s" % (GC, rGC)
1142 GC = rGC
1143
1144 # Initial Constraint Checks prior to execution
1145 STR = isValidStructure(s)
1146 START_SC , SC = checkSequenceConstraint(str(SC))
1147 START_LENGTH = checkSimilarLength(str(s), str(SC))
1148 START_constraint_compatibility , CompReport = checkConstaintCompatibility(BPstack, SC, IUPAC_compatibles)
1149
1150 g2g = good2Go(START_SC, START_LENGTH, START_constraint_compatibility, STR)
1151 if (g2g == 1):
1152 start_time = time.time()
1153 max_time = 600 # seconds
1154
1155
1156
1157
1158 ####
1159 # INITIALIZATION OF THE RNA TOOLs
1160 #
1161 RNAfold = init_RNAfold(temperature, paramFile)
1162 #RNAdistance = init_RNAdistance()
1163 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+')
1164 #RNAdist_pattern = re.compile('.*\s([\d]+)')
1165 #
1166 ####
1167
1168 terrain = initTerrain(s)
1169 #print len(terrain),
1170 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
1171 #print len(terrain[0])
1172 #printTerrain(terrain)
1173 #exit(0)
1174 global_ant_count = 0
1175 global_best_ants = 0
1176 criterion = False
1177 met = True
1178 ant_no = 1
1179 prev_res = 0
1180 seq = ""
1181
1182 counter = 0
1183
1184 dstruct_log = []
1185 dGC_log = []
1186
1187
1188 distance_structural = 1000
1189 distance_GC = 1000
1190 distance_seq = 1000
1191
1192 convergence = convergence_count
1193 convergence_counter = 0
1194
1195 resets = 0
1196
1197 path = ""
1198 curr_structure = ""
1199
1200 Dscore = 100000
1201 distance_structural = 10000
1202 distance_GC = 10000
1203 distance_seq = 10000
1204 best_solution = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1205 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1206
1207 best_solution_since = 0
1208
1209 ants_per_selection = 10
1210 if len(LP) > 0 :
1211 for lp in LP:
1212 s = substr(lp + 1, s, ".")
1213 s = substr(LP[lp] + 1, s, ".")
1214
1215 init = 1
1216 while criterion != met and getUsedTime(start_time) < max_time:
1217 iteration_start = time.time()
1218 global_ant_count += 1
1219 global_best_ants += 1
1220
1221 path_info = getPathFromSelection(ants_per_selection, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy)
1222
1223 distance_structural_prev = distance_structural
1224 distance_GC_prev = distance_GC
1225 distance_seq_prev = distance_seq
1226
1227 path, Dscore , distance_structural, distance_GC, distance_seq = path_info
1228 curr_structure = ""
1229 if pseudoknots:
1230 curr_structure = getPKStructure(path, strategy)
1231 else:
1232 curr_structure = getRNAfoldStructure(path, RNAfold)
1233
1234 curr_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1235 # BEST SOLUTION PICKING
1236 if improve == "h": # hierarchical check
1237 # for the global best solution
1238 if distance_structural < best_solution[3] or (distance_structural == best_solution[3] and distance_GC < best_solution[4]):
1239 best_solution = curr_solution
1240 ant_no = 1
1241 # for the local (reset) best solution
1242 if distance_structural < best_solution_local[3] or (distance_structural == best_solution_local[3] and distance_GC < best_solution_local[4]):
1243 best_solution_local = curr_solution
1244
1245 elif improve == "s": #score based check
1246 # store best global solution
1247 if Dscore < best_solution[2]:
1248 best_solution = curr_solution
1249 ant_no = 1
1250 # store best local solution for this reset
1251 if Dscore < best_solution_local[2]:
1252 best_solution_local = curr_solution
1253
1254 # OLD ' BEST SOLUTION ' PICKING
1255 # if Dscore < best_solution[2]:
1256 # best_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1257 #
1258 # if Dscore < best_solution_local[2]:
1259 # best_solution_local = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1260
1261
1262 distance_DN = 0
1263
1264 if verbose:
1265 print "SCORE " + str(Dscore) + " Resets " + str(resets) + " #Ant " + str(global_ant_count) + " out of " + str(ants_per_selection) + " cc " + str(convergence_counter)
1266
1267 print s, " <- target struct"
1268 print best_solution[0] , " <- BS since ", str(best_solution_since), "Size of Terrrain:", len(terrain[0])
1269 print best_solution[1] , " <- BS Dscore " + str(best_solution[2]) + " ds " + str(best_solution[3]) + " dGC " + str(best_solution[4]) + " dseq " + str(best_solution[5])+ " LP " + str(len(LP)) + " <- best solution stats"
1270 print curr_structure, " <- CS"
1271 print path,
1272 print " <- CS", "Dscore", str(Dscore), "ds", distance_structural, "dGC", distance_GC, "GC", getGC(path)*100, "Dseq", distance_seq
1273
1274 #### UPDATING THE TERRAIN ACCORDING TO THE QUALITY OF THE CURRENT BESTO-OUT-OF-k SOLUTION
1275 updateTerrain(path, curr_structure, s, distance_structural,distance_GC, distance_seq, distance_DN, terrain, evaporation_rate, correction_terms, BPstack, verbose, global_ant_count)
1276 ####
1277 if verbose: print "Used time for one iteration", time.time() - iteration_start
1278
1279
1280 # CONVERGENCE AND TERMINATION CRITERION MANAGEMENT
1281 #print distance_structural, distance_GC, best_solution_local[3], best_solution_local[4]
1282 if inConvergenceCorridor(curr_solution[3], curr_solution[4], best_solution_local[3], best_solution_local[4]):
1283 convergence_counter += 1
1284 if distance_structural_prev == distance_structural and distance_GC_prev == distance_GC:
1285 convergence_counter += 1
1286
1287 if best_solution[3] == objective_to_target_distance:
1288 if best_solution[4] == 0.0:
1289 break
1290 ant_no = ant_no + 1
1291 convergence_counter -= 1
1292 else:
1293 ant_no = 1
1294
1295
1296 if ant_no == termination_convergence or resets >= reset_limit or global_ant_count >= 100000 or best_solution_since == 5:
1297 break
1298
1299 # RESET
1300 if ant_no < termination_convergence and convergence_counter >= convergence:
1301
1302 terrain = initTerrain(s)
1303 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
1304 criterion = False
1305 met = True
1306 ant_no = 1
1307 prev_res = 0
1308 pre_path = "_" * len(s)
1309 path = ""
1310 curr_structure = ""
1311 counter = 0
1312 Dscore = 100000
1313 distance_structural = 1000
1314 distance_GC = 1000
1315 distance_seq = 1000
1316 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1317 convergence = convergence_count
1318 convergence_counter = 0
1319
1320 if resets == 0:
1321 sentinel_solution = best_solution
1322 best_solution_since += 1
1323 else:
1324 if best_solution[2] < sentinel_solution[2]:
1325 sentinel_solution = best_solution
1326 best_solution_since = 0
1327 else:
1328 best_solution_since += 1
1329
1330 resets += 1
1331
1332 duration = getUsedTime(start_time)
1333
1334 retString += "|Ants:" + str(global_ant_count)
1335 retString += "|Resets:" + str(resets) + "/" + str(reset_limit)
1336 retString += "|AntsTC:" + str(termination_convergence)
1337 retString += "|CC:" + str(convergence_count)
1338 retString += "|IP:" + str(improve)
1339 retString += "|BSS:" + str(best_solution_since)
1340 #if GC_message != "":
1341 # retString += GC_message + "\n"
1342
1343 sequence = best_solution[0]
1344 struct = best_solution[1]
1345
1346 retString += "|LP:" + str(len(LP))
1347 retString += "|ds:" + str(getStructuralDistance(s,SC, sequence, RNAfold, verbose, LP, BPstack, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy))
1348 retString += "|dGC:" + str(best_solution[4])
1349 retString += "|GC:" + str(getGC(sequence)*100)
1350 retString += "|dseq:" + str(getSequenceEditDistance(SC, sequence))
1351 retString += "|L:" + str(len(sequence))
1352 retString += "|Time:" + str(duration)
1353 retString2 += "\n" + struct + "\n"
1354 retString2 += sequence
1355
1356 # CLOSING THE PIPES TO THE PROGRAMS
1357 RNAfold.communicate()
1358 #RNAdistance.communicate()
1359
1360 else: # Structural premisses are not met, htherefore the program will halt with a failure message
1361 retString += "\nSome mistake detected\n"
1362 retString += "SequenceConstraintCheck: " + str(START_SC) + "\nSequenceConstraint: " + str(SC) + "\nLengthCheck: " + str(START_LENGTH) + "\nConstraintCompatibility: " + str(START_constraint_compatibility)+ "\n" + CompReport + "\n"
1363
1364 return (retString, retString2)
1365
1366 def findSequence(structure, Cseq, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU, return_mod = False):
1367 """
1368 MAIN antaRNA - ant assembled RNA
1369 """
1370
1371 if seed != "none":
1372 random.seed(seed)
1373
1374 if Cseq == "":
1375 sequenceconstraint = "N" * len(structure)
1376 else:
1377 sequenceconstraint = str(Cseq)
1378
1379 alpha = float(alpha)
1380 beta = float(beta)
1381 tGC = float(tGC)
1382 evaporation_rate = float(evaporation_rate)
1383 struct_correction_term = float(struct_correction_term)
1384 GC_correction_term = float(GC_correction_term)
1385 seq_correction_term = float(seq_correction_term)
1386 colonies = int(colonies)
1387 file_id = str(file_id)
1388 verbose = verbose
1389 output_verbose = output_verbose
1390 correction_terms = struct_correction_term, GC_correction_term, seq_correction_term
1391 temperature = float(temperature)
1392 print_to_STDOUT = (file_id == "STDOUT")
1393
1394 useGU = useGU
1395
1396 if return_mod == False:
1397 if print_to_STDOUT == False:
1398 outfolder = '/'.join(file_id.strip().split("/")[:-1])
1399 curr_dir = os.getcwd()
1400 if not os.path.exists(outfolder):
1401 os.makedirs(outfolder)
1402 os.chdir(outfolder)
1403
1404
1405 sequenceconstraint = transform(sequenceconstraint)
1406 ###############################################
1407
1408 # Allowed deviation from the structural target:
1409 objective_to_target_distance = 0.0
1410
1411 # Loading the IUPAC copatibilities of nuleotides and their abstract representing symbols
1412 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"}
1413 IUPAC_compatibles = loadIUPACcompatibilities(IUPAC, useGU)
1414
1415 IUPAC_reverseComplements = {}
1416 if useGU == False: ## Without the GU basepair
1417 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"C", "U":"A", "R":"UC", "Y":"AG", "S":"GC", "W":"UA","K":"CA", "M":"UG", "B":"AGC", "D":"ACU", "H":"UGA", "V":"UGC", "N":"ACGU"}
1418 else: ## allowing the GU basepair
1419 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"UC", "U":"AG", "R":"UC", "Y":"AG", "S":"UGC", "W":"UAG","K":"UCAG", "M":"UG", "B":"AGCU", "D":"AGCU", "H":"UGA", "V":"UGC", "N":"ACGU"}
1420
1421 result = []
1422 for col in xrange(colonies):
1423 # Checking the kind of taget GC value should be used
1424 GC = getGCSamplingValue(tGC, tGCmax, tGCvar)
1425
1426 # Actual execution of a ant colony procesdure
1427 output_v, output_w = runColony(structure, sequenceconstraint, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy)
1428
1429 # Post-Processing the output of a ant colony procedure
1430 line = ">" + name + str(col)
1431 if output_verbose:
1432 line += "|Cstr:" + structure + "|Cseq:" + sequenceconstraint + "|Alpha:" + str(alpha) + "|Beta:" + str(beta) + "|tGC:" + str(GC) + "|ER:" + str(evaporation_rate) + "|Struct_CT:" + str(struct_correction_term) + "|GC_CT:" + str(GC_correction_term) + "|Seq_CT:" + str(seq_correction_term) + output_v + output_w
1433 else:
1434 line += output_w
1435 if return_mod == False:
1436 if print_to_STDOUT:
1437 print line
1438 else:
1439 if col == 0:
1440 print2file(file_id, line, 'w')
1441 else:
1442 print2file(file_id, line, 'a')
1443 else:
1444 result.append(line)
1445
1446 if return_mod == True:
1447 return result
1448 if print_to_STDOUT == False:
1449 os.chdir(curr_dir)
1450
1451 def execute(args):
1452 """
1453 CHECK AND PARSE THE COMMAND LINE STUFF
1454 """
1455
1456 # Checking the arguments, parsed from the shell
1457 ###############################################
1458 name = args.name
1459 structure = args.Cstr
1460
1461 if args.Cseq == "":
1462 sequenceconstraint = "N" * len(structure)
1463 else:
1464 sequenceconstraint = args.Cseq
1465
1466 seed = args.seed
1467
1468
1469 alpha = args.alpha
1470 beta = args.beta
1471 tGC = args.tGC
1472 evaporation_rate = args.ER
1473 struct_correction_term = args.Cstrweight
1474 GC_correction_term = args.Cgcweight
1475 seq_correction_term = args.Cseqweight
1476 colonies = args.noOfColonies
1477 degreeOfSequenceInducement = args.level
1478 file_id = args.output_file
1479 verbose = args.verbose
1480 output_verbose = args.output_verbose
1481
1482 tGCmax = args.tGCmax
1483 tGCvar = args.tGCvar
1484
1485 termination_convergence = args.antsTerConv
1486 convergence_count = args.ConvergenceCount
1487 temperature = args.temperature
1488 reset_limit = args.Resets
1489
1490 improve = args.improve_procedure
1491
1492 ### RNAfold parameterfile
1493 paramFile = args.paramFile
1494
1495 # Using the pkiss program under user changeable parameters
1496 pseudoknots = args.pseudoknots
1497
1498 # Loading the optimized parameters for pseudoknots and ignore user input
1499 if args.pseudoknot_parameters:
1500 alpha = 1.0
1501 beta = 0.1
1502 evaporation_rate = 0.2
1503 struct_correction_term = 0.1
1504 GC_correction_term = 1.0
1505 seq_correction_term = 0.5
1506 termination_convergence = 50
1507 convergence_count = 100
1508
1509
1510 strategy = args.strategy
1511 useGU = args.useGUBasePair
1512
1513 checkForViennaTools()
1514 if pseudoknots:
1515 checkForpKiss()
1516 findSequence(structure, sequenceconstraint, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU)
1517
1518
1519 def exe():
1520 """
1521 MAIN EXECUTABLE WHICH PARSES THE INPUT LINE
1522 """
1523
1524 argument_parser = argparse.ArgumentParser(
1525 description = """
1526 Ant Colony Optimized RNA Sequence Design
1527 """,
1528
1529 epilog = """
1530
1531 #########################################################################
1532 # antaRNA - ant assembled RNA #
1533 # -> Ant Colony Optimized RNA Sequence Design #
1534 # ------------------------------------------------------------ #
1535 # Robert Kleinkauf (c) 2015 #
1536 # Bioinformatics, Albert-Ludwigs University Freiburg, Germany #
1537 #########################################################################
1538
1539 - For antaRNA only the VIENNNA RNA Package must be installed on your linux system.
1540 antaRNA will only check, if the executables of RNAfold and RNAdistance of the ViennaRNA package can be found. If those programs are
1541 not installed correctly, no output will be generated, an also no warning will be prompted.
1542 So the binary path of the Vienna Tools must be set up correctly in your system's PATH variable in order to run antaRNA correctly!
1543
1544 - antaRNA was only tested under Linux.
1545
1546 - For questions and remarks please feel free to contact us at http://www.bioinf.uni-freiburg.de/
1547
1548 Example calls:
1549 python antaRNA.py --Cstr "...(((...)))..." --tGC 0.5 -n 2
1550 python antaRNA.py --Cstr ".........AAA(((...)))AAA........." --tGC 0.5 -n 10 --output_file /path/to/antaRNA_TESTRUN -ov
1551 python antaRNA.py --Cstr "BBBBB....AAA(((...)))AAA....BBBBB" --Cseq "NNNNANNNNNCNNNNNNNNNNNGNNNNNNUNNN" --tGC 0.5 -n 10
1552
1553 #########################################################################
1554 # --- Hail to the Queen!!! All power to the swarm!!! --- #
1555 #########################################################################
1556 """,
1557 #formatter_class=RawTextHelpFormatter
1558 )
1559
1560 # mandatorys
1561 argument_parser.add_argument("-Cstr", "--Cstr", help="Structure constraint using RNA dotbracket notation with fuzzy block constraint. \n(TYPE: %(type)s)\n\n", type=str, required=True)
1562 argument_parser.add_argument("-tGC", "--tGC", help="Objective target GC content in [0,1].\n(TYPE: %(type)s)\n\n", type=float, required=True)
1563 argument_parser.add_argument("-n", "--noOfColonies", help="Number of sequences which shall be produced. \n(TYPE: %(type)s)\n\n\n\n", type=int, required=True)
1564 argument_parser.add_argument("-GU", "--useGUBasePair", help="Allowing GU base pairs. \n\n", action="store_true")
1565
1566 argument_parser.add_argument("-s", "--seed", help = "Provides a seed value for the used pseudo random number generator.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="none")
1567 argument_parser.add_argument("-ip", "--improve_procedure", help = "Select the improving method. h=hierarchical, s=score_based.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="s")
1568 argument_parser.add_argument("-r", "--Resets", help = "Amount of maximal terrain resets, until the best solution is retuned as solution.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=5)
1569 argument_parser.add_argument("-CC", "--ConvergenceCount", help = "Delimits the convergence count criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=130)
1570 argument_parser.add_argument("-aTC", "--antsTerConv", help = "Delimits the amount of internal ants for termination convergence criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=50)
1571
1572 argument_parser.add_argument("-p", "--pseudoknots", help = "Switch to pseudoknot based prediction using pKiss. Check the pseudoknot parameter usage!!!\n\n", action="store_true")
1573 argument_parser.add_argument("-pkPar", "--pseudoknot_parameters", help = "Enable optimized parameters for the usage of pseudo knots (Further parameter input ignored).\n\n", action="store_true")
1574 argument_parser.add_argument("--strategy", help = "Defining the pKiss folding strategy.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="A")
1575
1576 # Mutual Exclusiv target GC distribution variables
1577 #tGCgroup = argument_parser.add_mutually_exclusive_group()
1578 argument_parser.add_argument("-tGCmax", "--tGCmax", help = "Provides a maximum tGC value [0,1] for the case of uniform distribution sampling. The regular tGC value serves as minimum value.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0)
1579 argument_parser.add_argument("-tGCvar", "--tGCvar", help = "Provides a tGC variance (sigma square) for the case of normal distribution sampling. The regular tGC value serves as expectation value (mu).\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0)
1580
1581 argument_parser.add_argument("-t", "--temperature", help = "Provides a temperature for the folding algorithms.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=37.0)
1582 argument_parser.add_argument("-P", "--paramFile", help = "Changes the energy parameterfile of RNAfold. If using this explicitly, please provide a suitable energy file delivered by RNAfold. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="")
1583 argument_parser.add_argument("-of","--output_file", help="Provide a path and an output file, e.g. \"/path/to/the/target_file\". \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="STDOUT")
1584 argument_parser.add_argument("-Cseq", "--Cseq", help="Sequence constraint using RNA nucleotide alphabet {A,C,G,U} and wild-card \"N\". \n(TYPE: %(type)s)\n\n", type=str, default = "")
1585 argument_parser.add_argument("-l", "--level", help="Sets the level of allowed influence of sequence constraint on the structure constraint [0:no influence; 3:extensive influence].\n(TYPE: %(type)s)\n\n", type=int, default = 1)
1586 argument_parser.add_argument("--name", help="Defines a name which is used in the sequence output. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="antaRNA_")
1587 argument_parser.add_argument("-a", "--alpha", help="Sets alpha, probability weight for terrain path influence. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0)
1588 argument_parser.add_argument("-b", "--beta", help="Sets beta, probability weight for terrain pheromone influence. [0,1] \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0)
1589 argument_parser.add_argument("-er", "--ER", help="Pheromone evaporation rate. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.2)
1590 argument_parser.add_argument("-Cstrw", "--Cstrweight", help="Structure constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.5)
1591 argument_parser.add_argument("-Cgcw", "--Cgcweight", help="GC content constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=5.0)
1592 argument_parser.add_argument("-Cseqw", "--Cseqweight", help="Sequence constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n\n", type=float, default=1.0)
1593 argument_parser.add_argument("-v", "--verbose", help="Displayes intermediate output.\n\n", action="store_true")
1594 argument_parser.add_argument("-ov", "--output_verbose", help="Prints additional output to the headers of the produced sequences.\n\n", action="store_false")
1595
1596 args = argument_parser.parse_args()
1597
1598 execute(args)
1599
1600 def checkForViennaTools():
1601 """
1602 Checking for the presence of the Vienna tools in the system by which'ing for RNAfold and RNAdistance
1603 """
1604 RNAfold_output = subprocess.Popen(["which", "RNAfold"], stdout=subprocess.PIPE).communicate()[0].strip()
1605 if len(RNAfold_output) > 0 and RNAfold_output.find("found") == -1 and RNAfold_output.find(" no ") == -1:
1606 return True
1607 else:
1608 print "It seems the Vienna RNA Package is not installed on your machine. Please do so!"
1609 print "You can get it at http://www.tbi.univie.ac.at/"
1610 exit(0)
1611
1612
1613 def checkForpKiss():
1614 """
1615 Checking for the presence of pKiss
1616 """
1617 pKiss_output = subprocess.Popen(["which", "pKiss"], stdout=subprocess.PIPE).communicate()[0].strip()
1618 if len(pKiss_output) > 0 and pKiss_output.find("found") == -1 and pKiss_output.find(" no ") == -1:
1619 return True
1620 else:
1621 print "It seems that pKiss is not installed on your machine. Please do so!"
1622 print "You can get it at http://bibiserv2.cebitec.uni-bielefeld.de/pkiss"
1623 exit(0)
1624
1625
1626
1627 if __name__ == "__main__":
1628
1629 exe()
1630