0
|
1 import numpy
|
|
2 import sys
|
|
3 import random
|
|
4 import subprocess
|
|
5 import re
|
|
6 import decimal
|
|
7 import math
|
|
8 import os
|
|
9 import shutil
|
|
10 import time
|
|
11 import types
|
|
12 import argparse
|
|
13 #from argparse import RawTextHelpFormatter
|
|
14
|
|
15 #############################
|
|
16 # FUNCTIONS
|
|
17
|
|
18 def print2file(f, i, m):
|
|
19 """
|
|
20 print content i to file f in mode m
|
|
21 """
|
|
22 line = str(i)
|
|
23 if m == "a":
|
|
24 call = "echo \"" + line + "\" >> " + f
|
|
25 elif m == "w":
|
|
26 call = "echo \"" + line + "\" > " + f
|
|
27 os.system(call)
|
|
28
|
|
29 # checking and correcting the alphabet of the constraint sequence
|
|
30 def checkSequenceConstraint(SC):
|
|
31 """
|
|
32 Checks the Sequence constraint for illegal nucleotide characters
|
|
33 """
|
|
34 out = ""
|
|
35 for c in SC:
|
|
36 c = c.upper()
|
|
37 if c not in "ACGURYSWKMBDHVN":
|
|
38 # and c!= "R" and c != "Y" and c != "S" and c != "W" and c != "K" and c != "M" and c != "B" and c != "D" and c != "H" and c != "V":
|
|
39 if c == "T":
|
|
40 c = "U"
|
|
41 else:
|
|
42 print "\tIllegal Character in the constraint sequence!"
|
|
43 print "\tPlease use the IUPAC nomenclature for defining nucleotides in the constraint sequence!"
|
|
44 print "\tA Adenine"
|
|
45 print "\tC Cytosine"
|
|
46 print "\tG Guanine"
|
|
47 print "\tT/U Thymine/Uracil"
|
|
48 print "\tR A or G"
|
|
49 print "\tY C or T/U"
|
|
50 print "\tS G or C"
|
|
51 print "\tW A or T/U"
|
|
52 print "\tK G or T/U"
|
|
53 print "\tM A or C"
|
|
54 print "\tB C or G or T/U"
|
|
55 print "\tD A or G or T/U"
|
|
56 print "\tH A or C or T/U"
|
|
57 print "\tV A or C or G"
|
|
58 print "\tN any base"
|
|
59 exit(0)
|
|
60 out += c
|
|
61 return (1, out)
|
|
62
|
|
63
|
|
64 def transform(seq):
|
|
65 """
|
|
66 Transforms "U" to "T" for the processing is done on DNA alphabet
|
|
67 """
|
|
68 S = ""
|
|
69 for s in seq:
|
|
70 if s == "T":
|
|
71 S += "U"
|
|
72 else:
|
|
73 S += s
|
|
74 return S
|
|
75
|
|
76
|
|
77 def checkSimilarLength(s, SC):
|
|
78 """
|
|
79 Compares sequence and structure constraint length
|
|
80 """
|
|
81 if len(s) == len(SC):
|
|
82 return 1
|
|
83 else:
|
|
84 return 0
|
|
85
|
|
86
|
|
87 def isStructure(s):
|
|
88 """
|
|
89 Checks if the structure constraint only contains "(", ")", and "." and legal fuzzy structure constraint characters.
|
|
90 """
|
|
91 returnvalue = 1
|
|
92 for a in range(0,len(s)):
|
|
93 if s[a] not in ".()[]{}<>":
|
|
94 if s[a] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
95 returnvalue = 0
|
|
96 return returnvalue
|
|
97
|
|
98
|
|
99 def isBalanced(s):
|
|
100 """
|
|
101 Check if the structure s is of a balanced nature
|
|
102 """
|
|
103
|
|
104 balance = 1
|
|
105 for bracket in ["()", "[]", "{}", "<>"]:
|
|
106 counter = 0
|
|
107 for a in xrange(len(s)):
|
|
108 if s[a] in bracket[0]:
|
|
109 counter += 1
|
|
110 elif s[a] in bracket[1]:
|
|
111 counter -= 1
|
|
112 if counter != 0:
|
|
113 balance = 0
|
|
114 return balance
|
|
115
|
|
116
|
|
117
|
|
118 def fulfillsHairpinRule(s):
|
|
119 """
|
|
120 CHECKING FOR THE 3 nt LOOP INTERSPACE
|
|
121 for all kind of basepairs, even wihtin the pdeudoknots
|
|
122 """
|
|
123
|
|
124 fulfillsRules = 1
|
|
125 for bracket in ["()", "[]", "{}", "<>"]:
|
|
126 last_opening_char = 0
|
|
127 check = 0
|
|
128 for a in xrange(len(s)):
|
|
129 if s[a] == bracket[0]:
|
|
130 last_opening_char = a
|
|
131 check = 1
|
|
132 elif s[a] == bracket[1] and check == 1:
|
|
133 check = 0
|
|
134 if a - last_opening_char < 4:
|
|
135 return 0
|
|
136 return 1
|
|
137
|
|
138
|
|
139 def isValidStructure(s):
|
|
140 """
|
|
141 Checks, if the structure s is a valid structure
|
|
142 """
|
|
143
|
|
144 Structure = isStructure(s)
|
|
145 Balanced = isBalanced(s)
|
|
146 HairpinRule = fulfillsHairpinRule(s)
|
|
147
|
|
148 if Structure == 1 and Balanced == 1 and HairpinRule == 1:
|
|
149 return 1
|
|
150 else:
|
|
151 print Structure, Balanced, HairpinRule
|
|
152 return 0
|
|
153
|
|
154 def loadIUPACcompatibilities(IUPAC, useGU):
|
|
155 """
|
|
156 Generating a hash containing all compatibilities of all IUPAC RNA NUCLEOTIDES
|
|
157 """
|
|
158 compatible = {}
|
|
159 for nuc1 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE
|
|
160 sn1 = list(IUPAC[nuc1])
|
|
161 for nuc2 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE
|
|
162 sn2 = list(IUPAC[nuc2])
|
|
163 compatib = 0
|
|
164 for c1 in sn1: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE:
|
|
165 for c2 in sn2: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE:
|
|
166 # CHECKING THEIR COMPATIBILITY
|
|
167 if useGU == True:
|
|
168 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C") or (c1 == "G" and c2 == "U") or (c1 == "U" and c2 == "G"):
|
|
169 compatib = 1
|
|
170 else:
|
|
171 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C"):
|
|
172 compatib = 1
|
|
173 compatible[nuc1 + "_" + nuc2] = compatib # SAVING THE RESPECTIVE GROUP COMPATIBILITY, REVERSE SAVING IS NOT REQUIRED, SINCE ITERATING OVER ALL AGAINST ALL
|
|
174 return compatible
|
|
175
|
|
176 def isCompatibleToSet(c1, c2, IUPAC_compatibles):
|
|
177 """
|
|
178 Checks compatibility of c1 wihtin c2
|
|
179 """
|
|
180 compatible = True
|
|
181 for setmember in c2:
|
|
182 #print setmember
|
|
183 if isCompatible(c1, setmember, IUPAC_compatibles) == False:
|
|
184 return False
|
|
185 return compatible
|
|
186
|
|
187
|
|
188 def isCompatible(c1, c2, IUPAC_compatibles):
|
|
189 """
|
|
190 Checks compatibility between character c1 and c2
|
|
191 """
|
|
192 if IUPAC_compatibles[c1 + "_" + c2] == 1:
|
|
193 return True
|
|
194 else:
|
|
195 return False
|
|
196
|
|
197
|
|
198 def isStructureCompatible(lp1, lp2 ,bp):
|
|
199 """
|
|
200 Checks, if the region within lp1 and lp2 is structurally balanced
|
|
201 """
|
|
202 x = lp1 + 1
|
|
203 while (x < lp2):
|
|
204 if (bp[x] <= lp1 or bp[x] > lp2):
|
|
205 return False
|
|
206 if x == bp[x]:
|
|
207 x += 1
|
|
208 else:
|
|
209 x = bp[x] + 1
|
|
210 return x == lp2
|
|
211
|
|
212
|
|
213 def checkConstaintCompatibility(basepairstack, sequenceconstraint, IUPAC_compatibles):
|
|
214 """
|
|
215 Checks if the constraints are compatible to each other
|
|
216 """
|
|
217 returnstring = ""
|
|
218 compatible = 1
|
|
219 for id1 in basepairstack: # key = (constraint , (pos, constraint)))
|
|
220 constr1 = basepairstack[id1][0]
|
|
221 id2 = basepairstack[id1][1][0]
|
|
222 constr2 = basepairstack[id1][1][1]
|
|
223
|
|
224 if id1 != id2 and not isCompatible(constr1, constr2, IUPAC_compatibles):
|
|
225
|
|
226 compatible = 0
|
|
227 returnstring += "nucleotide constraint " + str(constr1) + " at position " + str(id1) + " is not compatible with nucleotide constraint " + str(constr2) + " at position " + str(id2) + "\n"
|
|
228 #if not isCompatible(basepairstack[basepair][0], basepairstack[basepair][1][1]):
|
|
229
|
|
230 #compatible = 0
|
|
231 #else:
|
|
232 #returnstring += "nucleotide constraint " + str(basepairstack[basepair][0]) + " at position " + str(basepair) + " is compatible with nucleotide constraint " + str(basepairstack[basepair][1][1]) + " at position " + str(basepairstack[basepair][1][0]) + "\n"
|
|
233 return (compatible, returnstring)
|
|
234
|
|
235
|
|
236 def getLP(BPSTACK):
|
|
237 """
|
|
238 Retreives valid lonley base pairs from a base pair stack
|
|
239 """
|
|
240 #20 ('N', (>BLOCK<, 'N'))
|
|
241
|
|
242 # geting single base pairs
|
|
243 stack = {}
|
|
244 LP = {}
|
|
245 if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType:
|
|
246 for i in BPSTACK.keys():
|
|
247 #if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
248 stack[i] = int(BPSTACK[i][1][0])
|
|
249 #print i , BPSTACK[i][1][0]
|
|
250 else:
|
|
251 for i in BPSTACK.keys():
|
|
252 #if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
253 stack[i] = BPSTACK[i]
|
|
254
|
|
255 # removing redundant base pair indices
|
|
256 for i in stack.keys():
|
|
257 if i >= stack[i]:
|
|
258 del stack[i]
|
|
259
|
|
260 # actual checking for single lonley base pairs
|
|
261 for i in stack.keys():
|
|
262 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and not (i+1 in stack and stack[i+1] == stack[i] - 1):
|
|
263 LP[i] = stack[i]
|
|
264
|
|
265 ##actual removal of 2er lonley base pairs
|
|
266 for i in stack.keys():
|
|
267 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and (i+1 in stack and stack[i+1] == stack[i] - 1) and not (i+2 in stack and stack[i+2] == stack[i] - 2):
|
|
268 LP[i] = stack[i]
|
|
269 LP[i+1] = stack[i+1]
|
|
270
|
|
271
|
|
272 #if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType:
|
|
273 #for i in BPSTACK.keys():
|
|
274
|
|
275 ##if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
276 #stack[i] = int(BPSTACK[i][1][0])
|
|
277 ##print i , BPSTACK[i][1][0]
|
|
278 #else:
|
|
279 #for i in BPSTACK.keys():
|
|
280 ##if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
281 #stack[i] = BPSTACK[i]
|
|
282
|
|
283 #for i in stack.keys():
|
|
284 #if i >= stack[i]:
|
|
285 #del stack[i]
|
|
286
|
|
287
|
|
288
|
|
289 return LP
|
|
290
|
|
291
|
|
292 def getBPStack(s, seq):
|
|
293 """
|
|
294 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq.
|
|
295 """
|
|
296 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]}
|
|
297 bpstack = {}
|
|
298 for i in xrange(len(s)):
|
|
299
|
|
300 # REGULAR SECONDARY STRUCTURE DETECTION
|
|
301 if s[i] in "(){}[]<>":
|
|
302
|
|
303 no = 0
|
|
304 ### opening
|
|
305 if s[i] in "([{<":
|
|
306 if s[i] == "(":
|
|
307 tmp_stack["()"].append((i, seq[i]))
|
|
308 elif s[i] == "[":
|
|
309 tmp_stack["[]"].append((i, seq[i]))
|
|
310 elif s[i] == "{":
|
|
311 tmp_stack["{}"].append((i, seq[i]))
|
|
312 elif s[i] == "<":
|
|
313 tmp_stack["<>"].append((i, seq[i]))
|
|
314
|
|
315 #closing
|
|
316 elif s[i] in ")]}>":
|
|
317 if s[i] == ")":
|
|
318 no, constr = tmp_stack["()"].pop()
|
|
319 elif s[i] == "]":
|
|
320 no, constr = tmp_stack["[]"].pop()
|
|
321 elif s[i] == "}":
|
|
322 no, constr = tmp_stack["{}"].pop()
|
|
323 elif s[i] == ">":
|
|
324 no, constr = tmp_stack["<>"].pop()
|
|
325 bpstack[no] = (constr, (i, seq[i]))
|
|
326 bpstack[i] = (seq[i] ,(no, constr))
|
|
327
|
|
328 elif s[i] == ".":
|
|
329 bpstack[i] = (seq[i], (i, seq[i]))
|
|
330 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
331 bpstack[i] = (seq[i], (i, seq[i]))
|
|
332
|
|
333 return (bpstack, getLP(bpstack))
|
|
334
|
|
335
|
|
336 def getbpStack(s):
|
|
337 """
|
|
338 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq.
|
|
339 """
|
|
340 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]}
|
|
341 bpstack = {}
|
|
342
|
|
343 for i in xrange(len(s)):
|
|
344 if s[i] in "(){}[]<>":
|
|
345
|
|
346 no = 0
|
|
347 ### opening
|
|
348 if s[i] in "([{<":
|
|
349 if s[i] == "(":
|
|
350 tmp_stack["()"].append(i)
|
|
351 elif s[i] == "[":
|
|
352 tmp_stack["[]"].append(i)
|
|
353 elif s[i] == "{":
|
|
354 tmp_stack["{}"].append(i)
|
|
355 elif s[i] == "<":
|
|
356 tmp_stack["<>"].append(i)
|
|
357
|
|
358 #closing
|
|
359 elif s[i] in ")]}>":
|
|
360 if s[i] == ")":
|
|
361 no = tmp_stack["()"].pop()
|
|
362 elif s[i] == "]":
|
|
363 no = tmp_stack["[]"].pop()
|
|
364 elif s[i] == "}":
|
|
365 no = tmp_stack["{}"].pop()
|
|
366 elif s[i] == ">":
|
|
367 no = tmp_stack["<>"].pop()
|
|
368 bpstack[no] = i # save basepair in the format {opening base id (opening seq constr,(closing base id, closing seq constr))}
|
|
369 bpstack[i] = no # save basepair in the format {closing base id (closing seq constr,(opening base id, opening seq constr))}
|
|
370
|
|
371 elif s[i] == ".": # no structural constaint given: produce entry, which references itself as a base pair partner....
|
|
372 bpstack[i] = i
|
|
373
|
|
374 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
375 bpstack[i] = i
|
|
376
|
|
377 #elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
378 ## per position, assigned to a certain block, the target nucleotide, with whcih it should interact is marked with the specific
|
|
379 ## block character
|
|
380 #bpstack[i] = s[i]
|
|
381
|
|
382 return (bpstack, getLP(bpstack))
|
|
383
|
|
384 def maprange( a, b, s):
|
|
385 """
|
|
386 Mapping function
|
|
387 """
|
|
388 (a1, a2), (b1, b2) = a, b
|
|
389 return b1 + ((s - a1) * (b2 - b1) / (a2 - a1))
|
|
390
|
|
391
|
|
392 def applyGCcontributionPathAdjustment(pathlength, tmpGC, nt):
|
|
393 """
|
|
394 GC path length contribution calculation.
|
|
395 """
|
|
396 GCadjustment = 1.5
|
|
397 minimum = 0.5
|
|
398 upper = GCadjustment
|
|
399 lower = minimum
|
|
400
|
|
401 if nt == "A" or nt == "U":
|
|
402 pathlength = pathlength * maprange( (0, 1) , (lower, upper), tmpGC)
|
|
403
|
|
404 if nt == "G" or nt == "C":
|
|
405 #pathlength = pathlength * (float(1-tmpGC))
|
|
406 pathlength = pathlength * maprange( (1, 0) , (lower, upper), tmpGC)
|
|
407 return pathlength
|
|
408
|
|
409 def getConstraint(TE, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements):
|
|
410 """
|
|
411 Dependend on the situation in the constraint an the respective path section, setting wether a specific constraint can be given or not (for that path section)
|
|
412 """
|
|
413 # TE :: transition element / path section under dispute
|
|
414 # id1 :: id of the position of the caharacter to which the transition is leading to
|
|
415 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present
|
|
416 # val :: BPstack information of the specific position
|
|
417 # constr1 :: constraining character of pos id1
|
|
418 # constr2 :: constraining character of pos id2
|
|
419
|
|
420 id1 = int(TE.split(".")[0])
|
|
421 val = BPstack[id1] # check out the value of the destination character in the basepair/constraint stack
|
|
422 constr1 = val[0] # getting the constraint character of position id1
|
|
423 id2 = int(val[1][0]) # getting position id2
|
|
424 constr2 = val[1][1] # getting the sequence constraint for position id2
|
|
425 targetNucleotide = TE.split(".")[1][-1:] # where the edge is leading to
|
|
426
|
|
427 c1 = set(IUPAC[constr1]) # getting all explicit symbols of c1
|
|
428 c2 = set(IUPAC_reverseComplements[constr2]) # getting the reverse complement explicit symbols of c2
|
|
429
|
|
430 if targetNucleotide in c1:
|
|
431 if id1 == id2:
|
|
432 return 1
|
|
433 else:
|
|
434 if targetNucleotide in c2:
|
|
435 return 1
|
|
436 else:
|
|
437 return 0
|
|
438 else:
|
|
439 return 0
|
|
440
|
|
441 """
|
|
442 def getConstraint(TE, BPstack):
|
|
443 # TE :: transition element / path section
|
|
444 # id1 :: id of the position of the caharacter to which the transition is leading to
|
|
445 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present
|
|
446 # val :: BPstack information of the specific position
|
|
447 # constr1 :: constraining character of pos id1
|
|
448 # constr2 :: constraining character of pos id2
|
|
449
|
|
450 ### BPstack [id1] = (constr1, (id2, constr2))
|
|
451
|
|
452 id1 = TE.split(".")[0]
|
|
453 #print id1
|
|
454 #id1 = TE.find(TE.strip("_")) # strip the path section and getting the position of the section
|
|
455 #if len(TE.strip("_")) == 2: # check if the path section is from an internal and not an initial transition
|
|
456 #id1 += 1 # increase position id1 by 1, since the last character of the section is the destination character
|
|
457 val = BPstack[int(id1)] # check out the value of the destination character in the basepair/constraint stack
|
|
458 constr1 = val[0] # getting the constraint character of position id1
|
|
459 id2 = val[1][0] # getting position id2
|
|
460 constr2 = val[1][1] # getting the sequence constraint for position id2
|
|
461 #print TE, id1, constr1, id2, constr2,
|
|
462
|
|
463 #TE.split(".")[1][-1:]
|
|
464 if id1 == id2: # both ids were the same with either character, sequential or no sequential constraint -> no basepair constraint
|
|
465 if constr1 == TE.split(".")[1][-1:] and constr2 == TE.split(".")[1][-1:]: # case if the single base constraints on position id1 == id2 are the same as the destination character on id1
|
|
466 #print 1
|
|
467 return 1
|
|
468 elif constr1 == constr2 == "N": # case if the single base constraints on position id1 == id2 has no constraint
|
|
469 #print 1
|
|
470 return 1
|
|
471 else: # single base sequence constraints differ
|
|
472 #print 0
|
|
473 return 0
|
|
474
|
|
475 elif id1 != id2: # showing differentq ids, indicating a bp, (basepair structural constraint)
|
|
476 if constr1 == "N" and constr2 == "N": # no sequence constraint
|
|
477 #print 1
|
|
478 return 1
|
|
479 if constr1 == "N" and constr2 != "N": # c1 has no constraint, c2 has character constraint (sequence constraint of closing bases)
|
|
480 if TE.split(".")[1][-1:] == complementBase(constr2): # the current path section destination base is equal to the complement base of the mentioned sequence constraint in constr2
|
|
481 #print 1
|
|
482 return 1
|
|
483 else: # case if the current path section destination base is not equeal to the mentioned complement sequence constraint in constr2
|
|
484 #print 0
|
|
485 return 0
|
|
486 if constr1 != "N" and constr2 == "N": # c1 has character constraint, c2 has no character constraint (sequence constraint in the opening base)
|
|
487 if TE.split(".")[1][-1:] == constr1: # the current path section destination base is as constrained with constr1
|
|
488 #print 1
|
|
489 return 1
|
|
490 else: # the current path section destination base is not as constrained in constr1
|
|
491 #print 0
|
|
492 return 0
|
|
493 if constr1 != "N" and constr2 != "N": # both positions have sequential constraint
|
|
494 if TE.split(".")[1][-1:] == constr1:
|
|
495 #print 1
|
|
496 return 1
|
|
497 else:
|
|
498 #print 0
|
|
499 return 0
|
|
500 """
|
|
501
|
|
502 def applyTerrainModification(terrain, s, tmpGC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements):
|
|
503 #nucleotides = {'A': 0, 'C': 1,'G': 2,'T': 3}
|
|
504
|
|
505 dels = []
|
|
506 for terrainelement in sorted(terrain):
|
|
507 pheromone, pathlength = terrain[terrainelement]
|
|
508 pheromone = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
|
|
509 pathlength = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
|
|
510 pathlength = applyGCcontributionPathAdjustment(pathlength, tmpGC,terrainelement.split(".")[1][-1:])
|
|
511 if pheromone * pathlength == 0: dels.append(terrainelement)
|
|
512 terrain[terrainelement] = (pheromone, pathlength,[])
|
|
513 further_dels = {}
|
|
514 for terrainelement in sorted(dels):
|
|
515 pos, nucs = terrainelement.split(".")
|
|
516 if int(pos) < len(s)-1:
|
|
517 to_nt = nucs[-1:]
|
|
518 successor_pos = int(pos) + 1
|
|
519 for i in ["A", "C", "G", "U"]:
|
|
520 del_element = str(successor_pos) + "." + to_nt + i
|
|
521 further_dels[del_element] = 1
|
|
522 further_dels[terrainelement] = 1
|
|
523 # deleting the inbound and outbound edges, which are forbidden
|
|
524 for terrainelement in further_dels:
|
|
525 del terrain[terrainelement]
|
|
526 # allocate the appropriate children of edges
|
|
527 for terrainelement in terrain:
|
|
528 pheromone, pathlength, children = terrain[terrainelement]
|
|
529 pos, nucs = terrainelement.split(".")
|
|
530 if int(pos) < len(s):
|
|
531 to_nt = nucs[-1:]
|
|
532 successor_pos = int(pos) + 1
|
|
533 for i in ["A", "C", "G", "U"]:
|
|
534 if str(successor_pos) + "." + to_nt + i in terrain:
|
|
535 children.append(str(successor_pos) + "." + to_nt + i)
|
|
536 terrain[terrainelement] = (pheromone, pathlength,children)
|
|
537 starts = []
|
|
538 for i in ["A", "C", "G", "U"]:
|
|
539 if str(0) + "." + i in terrain:
|
|
540 starts.append(str(0) + "." + i)
|
|
541 terrain["00.XY"] = (1, 1, starts)
|
|
542 return (terrain, BPstack)
|
|
543
|
|
544
|
|
545 def initTerrain(s): # THE CLASSIC
|
|
546 """
|
|
547 Initialization of the terrain with graph like terrain... vertices are modeled implicitly
|
|
548 """
|
|
549 nt = ["A","C","G","U"]
|
|
550 nt2 = ["AA","AC","AG","AU","CA","CC","CG","CU","GA","GC","GG","GU","UA","UC","UG","UU"] # Allowed dinucleotides
|
|
551 e = {}
|
|
552 pathlength = 1
|
|
553 pheromone = 1
|
|
554 for p in xrange(len(s)):
|
|
555 if p == 0:
|
|
556 for i in nt:
|
|
557 e["%s.%s"%(p,i)] = (pheromone, pathlength)
|
|
558 elif p > 0:
|
|
559 for n in nt2:
|
|
560 e["%s.%s"%(p,n)] = (pheromone, pathlength)
|
|
561 return e
|
|
562
|
|
563
|
|
564
|
|
565 def complementBase(c):
|
|
566 """
|
|
567 Returns the complement RNA character of c (without GU base pairs)
|
|
568 """
|
|
569 retChar = ""
|
|
570 if c == "A" :
|
|
571 retChar = "U"
|
|
572 elif c == "U":
|
|
573 retChar = "A"
|
|
574 elif c == "C":
|
|
575 retChar = "G"
|
|
576 elif c == "G":
|
|
577 retChar = "C"
|
|
578 return retChar
|
|
579
|
|
580 def printTerrain(terrain):
|
|
581 #print sorted(terrain.keys())
|
|
582 tmp_i = "0"
|
|
583 tmp_c = 0
|
|
584 terrain = terrain[0]
|
|
585
|
|
586 for a, i in enumerate(sorted(terrain.keys())):
|
|
587 #print a
|
|
588 if i.split(".")[0] != tmp_i:
|
|
589 print "\nElements:", tmp_c,"\n#########################\n", i, terrain[i]
|
|
590
|
|
591 tmp_c = 1
|
|
592 tmp_i = i.split(".")[0]
|
|
593 else:
|
|
594 print i, terrain[i]
|
|
595 tmp_c += 1
|
|
596
|
|
597 print "\nElements:", tmp_c
|
|
598 print "#########################"
|
|
599 print len(terrain)
|
|
600
|
|
601 def pickStep(tmp_steps, summe):
|
|
602 """
|
|
603 Selects a step within the terrain
|
|
604 """
|
|
605 if len(tmp_steps) == 1:
|
|
606 return tmp_steps[0][1] # returning the nucleotide of the only present step
|
|
607 else:
|
|
608 rand = random.random() # draw random number
|
|
609 mainval = 0
|
|
610 for choice in xrange(len(tmp_steps)):
|
|
611 val, label = tmp_steps[choice]
|
|
612 mainval += val/float(summe)
|
|
613 if mainval > rand: # as soon, as the mainval gets larger than the random value the assignment is done
|
|
614 return label
|
|
615
|
|
616 def getPath(s, tmp_terrain, tmp_BPstack, alpha, beta, IUPAC, IUPAC_reverseComplements):
|
|
617 """
|
|
618 Performs a walk through the terrain and assembles a sequence, while respecting the structure constraint and IUPAC base complementarity
|
|
619 of the base pairs GU, GC and AT
|
|
620 """
|
|
621 nt = ["A","C","G","U"]
|
|
622 prev_edge = "00.XY"
|
|
623 sequence = ""
|
|
624 while len(sequence) < len(s):
|
|
625 coming_from = sequence[-1:]
|
|
626 summe = 0
|
|
627 steps = []
|
|
628 i = len(sequence)
|
|
629 allowed_nt = "ACGU"
|
|
630 # base pair closing case check, with subsequent delivery of a reduced allowed nt set
|
|
631
|
|
632 if i > tmp_BPstack[i][1][0]:
|
|
633 jump = tmp_BPstack[i][1][0]
|
|
634 nuc_at_jump = sequence[jump]
|
|
635 allowed_nt = IUPAC_reverseComplements[nuc_at_jump]
|
|
636
|
|
637 #allowed_nt = complementBase(nuc_at_jump)
|
|
638
|
|
639 # Checking for every possible nt if it is suitable for the selection procedure
|
|
640 for edge in tmp_terrain[prev_edge][-1]:
|
|
641
|
|
642 if edge[-1:] in allowed_nt:
|
|
643 pheromone, PL , children = tmp_terrain[edge]
|
|
644 #if PL > 0:
|
|
645 value = ((float(pheromone * alpha)) + ((1/float(PL)) * beta))
|
|
646 summe += value
|
|
647 steps.append((value, edge))
|
|
648 prev_edge = pickStep(steps, summe)
|
|
649 sequence += prev_edge[-1:]
|
|
650
|
|
651 return sequence
|
|
652
|
|
653
|
|
654 ###
|
|
655 # STRUCTURE PREDICTORS
|
|
656 ###
|
|
657 def getPKStructure(sequence, temperature, mode = "A"):
|
|
658 """
|
|
659 Initialization pKiss mfe pseudoknot prediction
|
|
660 """
|
|
661 p2p = "pKiss"
|
|
662 #p2p = "/usr/local/pkiss/2014-03-17/bin/pKiss_mfe"
|
|
663 strategy = "--strategy "
|
|
664 t = "--temperature " + str(temperature)
|
|
665
|
|
666 if mode == "A": strategy += "A"
|
|
667 elif mode == "B": strategy += "B"
|
|
668 elif mode == "C": strategy += "C"
|
|
669 elif mode == "D": strategy += "D"
|
|
670 elif mode == "P": strategy += "P"
|
|
671
|
|
672 p = subprocess.Popen( ([p2p, "--mode mfe", strategy, t]),
|
|
673 #shell = True,
|
|
674 stdin = subprocess.PIPE,
|
|
675 stdout = subprocess.PIPE,
|
|
676 stderr = subprocess.PIPE,
|
|
677 close_fds = True)
|
|
678 #print p.stderr.readline()
|
|
679
|
|
680 p.stdin.write(sequence+'\n')
|
|
681 pks = p.communicate()
|
|
682 structure = "".join(pks[0].split("\n")[2].split(" ")[-1:])
|
|
683 return structure
|
|
684
|
|
685 def init_RNAfold(temperature, paramFile = ""):
|
|
686 """
|
|
687 Initialization RNAfold listener
|
|
688 """
|
|
689 #p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAfold"
|
|
690 p2p = "RNAfold"
|
|
691
|
|
692 t = "-T " + str(temperature)
|
|
693 P = ""
|
|
694 if paramFile != "":
|
|
695 P = "-P " + paramFile
|
|
696 p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]),
|
|
697 #shell = True,
|
|
698 stdin = subprocess.PIPE,
|
|
699 stdout = subprocess.PIPE,
|
|
700 stderr = subprocess.PIPE,
|
|
701 close_fds = True)
|
|
702 return p
|
|
703
|
|
704
|
|
705 def consult_RNAfold(seq, p):
|
|
706 """
|
|
707 Consults RNAfold listener
|
|
708 """
|
|
709 p.stdin.write(seq+'\n')
|
|
710 out = ""
|
|
711 for i in xrange(2):
|
|
712 out += p.stdout.readline()
|
|
713 return out
|
|
714
|
|
715
|
|
716 def getRNAfoldStructure(struct2, process1):
|
|
717 """
|
|
718 Retrieves folded structure of a RNAfold call
|
|
719 """
|
|
720
|
|
721 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+')
|
|
722 #RNAdist_pattern = re.compile('.*\s([\d]+)')
|
|
723 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(struct2, process1))
|
|
724 current_structure = ""
|
|
725 #if RNAfold_match:
|
|
726 return RNAfold_match.group(1)
|
|
727
|
|
728
|
|
729 def init_RNAdistance():
|
|
730 """
|
|
731 Initialization of RNAdistance listener
|
|
732 """
|
|
733 #p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAdistance"
|
|
734 p2p = "RNAdistance"
|
|
735 p = subprocess.Popen( ([p2p]),
|
|
736 #shell = True,
|
|
737 stdin = subprocess.PIPE,
|
|
738 stdout = subprocess.PIPE,
|
|
739 stderr = subprocess.PIPE,
|
|
740 close_fds = True)
|
|
741 return p
|
|
742
|
|
743
|
|
744 def consult_RNAdistance(s1, s2, p):
|
|
745 """
|
|
746 Consulting the RNAdistance listener
|
|
747 """
|
|
748 p.stdin.write(s1+'\n')
|
|
749 p.stdin.write(s2+'\n')
|
|
750 out = ""
|
|
751 out_tmp = p.stdout.readline().strip()
|
|
752 if out_tmp != "":
|
|
753 out += out_tmp
|
|
754 return out
|
|
755
|
|
756 def getInducingSequencePositions(Cseq, degreeOfSequenceInducement):
|
|
757 """
|
|
758 Delimiting the degree of structure inducement by the supplied sequence constraint.
|
|
759 0 : no sequence induced structure constraint
|
|
760 1 : "ACGT" induce structure (explicit nucleotide structure inducement level)
|
|
761 2 : "MWKSYR" and "ACGT" (explicit and double instances)
|
|
762 3 : "BDHV" , "MWKSYR" and "ACGT" (explicit, double, and triple instances)
|
|
763 """
|
|
764 setOfNucleotides = "" # resembling the "0"-case
|
|
765 if degreeOfSequenceInducement == 1:
|
|
766 setOfNucleotides = "ACGU"
|
|
767 elif degreeOfSequenceInducement == 2:
|
|
768 setOfNucleotides = "ACGUMWKSYR"
|
|
769 elif degreeOfSequenceInducement == 3:
|
|
770 setOfNucleotides = "ACGUMWKSYRBDHV"
|
|
771 #elif degreeOfSequenceInducement == 4:
|
|
772 #setOfNucleotides = "ACGTMWKSYRBDHVN"
|
|
773
|
|
774 tmpSeq = ""
|
|
775 listset = setOfNucleotides
|
|
776 for pos in Cseq:
|
|
777 if pos not in listset:
|
|
778 tmpSeq += "N"
|
|
779 else:
|
|
780 tmpSeq += pos
|
|
781
|
|
782 return setOfNucleotides, tmpSeq
|
|
783
|
|
784
|
|
785 def getBPDifferenceDistance(stack1, stack2):
|
|
786 """
|
|
787 Based on the not identical amount of base pairs within both structure stacks
|
|
788 """
|
|
789 d = 0
|
|
790 for i in stack1.keys():
|
|
791 # check base pairs in stack 1
|
|
792 if i < stack1[i] and stack1[i] != stack2[i]:
|
|
793 d += 1
|
|
794 # check base pairs in stack 2
|
|
795 for i in stack2.keys():
|
|
796 if i < stack2[i] and stack1[i] != stack2[i]:
|
|
797 d += 1
|
|
798 return d
|
|
799
|
|
800
|
|
801 def getStructuralDistance(target_structure, Cseq, path, RNAfold, verbose, LP, BP, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy):
|
|
802 """
|
|
803 Calculator for Structural Distance
|
|
804 """
|
|
805 # fold the current solution's sequence to obtain the structure
|
|
806
|
|
807 current_structure = ""
|
|
808
|
|
809 if pseudoknots:
|
|
810 current_structure = getPKStructure(path,strategy)
|
|
811 else:
|
|
812 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(path, RNAfold))
|
|
813 current_structure = RNAfold_match.group(1)
|
|
814
|
|
815 # generate the current structure's base-pair stack
|
|
816 bp = getbpStack(current_structure)[0]
|
|
817 # add case-dependend structural constraints in case of lonley basepairs formation
|
|
818 tmp_target_structure_bp = getbpStack(target_structure)[0]
|
|
819
|
|
820 for lp in LP:
|
|
821 if bp[lp] == LP[lp]: # if the base pair is within the current solution structure, re-add the basepair into the constraint structure.
|
|
822 #tmp_target_structure[lp] = "("
|
|
823 #tmp_target_structure[LP[lp]] = ")"
|
|
824 tmp_target_structure_bp[lp] = LP[lp]
|
|
825 tmp_target_structure_bp[LP[lp]] = lp
|
|
826
|
|
827 # REMOVE BLOCK CONSTRAINT AND SUBSTITUTE IT WITH SINGLE STRAND INFORMATION repsective with brackets, if allowed base pairs occure
|
|
828 # check for all allowed implicit constraint block declarators
|
|
829 for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
830 occurances = []
|
|
831 for m in re.finditer(c, target_structure): # search for a declarator in the requested structure
|
|
832 occurances.append(m.start()) # save the corresponding index
|
|
833
|
|
834 # transform declarator into single stranded request
|
|
835 for i in occurances:
|
|
836 #tmp_target_structure[i] = "."
|
|
837 tmp_target_structure_bp[i] = i
|
|
838 # infer a base pair within the block declarated positions, if the current structure provides it.
|
|
839 for i in occurances:
|
|
840 for j in occurances:
|
|
841 if i < j:
|
|
842 if bp[i] == j:
|
|
843 #tmp_target_structure[i] = "("
|
|
844 #tmp_target_structure[bp[i]] = ")"
|
|
845
|
|
846 tmp_target_structure_bp[i] = bp[i]
|
|
847 tmp_target_structure_bp[bp[i]] = i
|
|
848
|
|
849 # CHECK FOR SEQUENCE CONSTRAINT WHICH INDUCES STRUCTURE CONSTRAINT IN THE MOMENTARY SITUATION
|
|
850 #print "Checking Cseq influence and it's induced basepairs..."
|
|
851 IUPACinducers, tmp_Cseq = getInducingSequencePositions(Cseq, degreeOfSequenceInducement)
|
|
852 if len(Cseq.strip("N")) > 0:
|
|
853 #print "Processing Cseq influence"
|
|
854 # Iterate over all positions within the Base Pair stack
|
|
855 for i in BP: # Check for each base index i
|
|
856
|
|
857 if i < bp[i]: # if the current index is samller that the affiliated in the basepair stack of the current solution
|
|
858
|
|
859 bp_j = bp[i] # Actual j index of the current solution
|
|
860 BP_j = BP[i][1][0] # j index of the requested structure
|
|
861 if (i != bp_j and i == BP_j and BP[i][0] in IUPACinducers ): # if i pairs with some other base in the current structure, and i is requested single stranded and the Sequence constraint is allowed to induce...
|
|
862 if (BP[bp_j][1][0] == bp_j and BP[bp_j][0] in IUPACinducers):# If position j is requested singlestranded and position j nucleotide can induce base pairs
|
|
863 #if isCompatible(bp[i][0], bp[i][1][1], IUPAC_compatibles): # If both nucleotides, i and j are actually compatible
|
|
864 #tmp_target_structure[i] = "("
|
|
865 #tmp_target_structure[bp_j] = ")"
|
|
866
|
|
867 tmp_target_structure_bp[i] = bp[i]
|
|
868 tmp_target_structure_bp[bp_j] = i
|
|
869
|
|
870 #tts = "".join(tmp_target_structure)
|
|
871 dsreg = getBPDifferenceDistance(tmp_target_structure_bp, bp)
|
|
872
|
|
873 # CHECK FOR ALL DETERMINED LONELY BASE PAIRS (i<j), if they are formed
|
|
874 failLP = 0
|
|
875 for lp in LP:
|
|
876
|
|
877 if bp[lp] != LP[lp]:
|
|
878
|
|
879 isComp = isCompatible(path[lp],path[LP[lp]], IUPAC_compatibles)
|
|
880 isStru = isStructureCompatible(lp, LP[lp] ,bp)
|
|
881 if not ( isStru and isStru ): # check if the bases at the specific positions are compatible and check if the
|
|
882 # basepair can be formed according to pseudoknot free restriction. If one fails, a penalty distance is raised for that base pair
|
|
883 failLP += 1
|
|
884
|
|
885 #print dsreg, failLP, float(len(tmp_target_structure_bp))
|
|
886 dsLP = float(failLP)
|
|
887
|
|
888 return (dsreg + dsLP) /float(len(tmp_target_structure_bp))
|
|
889
|
|
890
|
|
891 def getGC(sequence):
|
|
892 """
|
|
893 Calculate GC content of a sequence
|
|
894 """
|
|
895 GC = 0
|
|
896 for nt in sequence:
|
|
897 if nt == "G" or nt == "C":
|
|
898 GC = GC + 1
|
|
899 GC = GC/float(len(sequence))
|
|
900 return GC
|
|
901
|
|
902
|
|
903 def getGCDistance(tGC, gc2, L):
|
|
904 """
|
|
905 Calculate the pseudo GC content distance
|
|
906 """
|
|
907 nt_coeff = L * tGC
|
|
908 pc_nt = (1/float(L))*100
|
|
909 #
|
|
910 d = gc2 - tGC
|
|
911 d = d * 100
|
|
912
|
|
913 f = math.floor(nt_coeff)
|
|
914 c = math.ceil(nt_coeff)
|
|
915
|
|
916 if d < 0: #
|
|
917 #print "case x",(abs(nt_coeff - f)), pc_nt, (abs(nt_coeff - f)) * pc_nt,
|
|
918 d = d + (abs(nt_coeff - f)) * pc_nt
|
|
919 elif d > 0: # case y
|
|
920 #print "case y", abs(nt_coeff - c), pc_nt, abs(nt_coeff - c) * pc_nt,
|
|
921 d = d - abs(nt_coeff - c) * pc_nt
|
|
922 elif d == 0:
|
|
923 pass
|
|
924
|
|
925 d = round(d, 7)
|
|
926 #d = max(0, abs(d)- ( max ( abs( math.ceil(nt_coeff)-(nt_coeff)) , abs(math.floor(nt_coeff)-(nt_coeff)) )/L)*100 )
|
|
927 return abs(d)
|
|
928
|
|
929
|
|
930 def getSequenceEditDistance(SC, path):
|
|
931 """
|
|
932 Calculate sequence edit distance of a solution to the constraint
|
|
933 """#
|
|
934 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"}
|
|
935 edit = 0
|
|
936 for i in xrange(len(SC)):
|
|
937 if path[i] not in IUPAC[SC[i]]:
|
|
938 edit += 1
|
|
939 return edit/float(len(path))
|
|
940
|
|
941
|
|
942
|
|
943 def getTransitions(p):
|
|
944 """
|
|
945 Retreive transitions of a specific path/sequence
|
|
946 """
|
|
947 transitions = []
|
|
948 for pos in xrange(len(p)):
|
|
949 if pos == 0:
|
|
950 transitions.append(str(pos) + "." + p[pos])
|
|
951
|
|
952 else:
|
|
953 insert = p[pos-1] + p[pos]
|
|
954 transitions.append(str(pos) + "." + insert)
|
|
955
|
|
956 return transitions
|
|
957
|
|
958
|
|
959 def evaporate(t, er):
|
|
960 """
|
|
961 Evaporate the terrain's pheromone trails
|
|
962 """
|
|
963 terr, BP = t
|
|
964 c = 1
|
|
965 for key in terr:
|
|
966 p,l,c = terr[key]
|
|
967 p *= (1-er)
|
|
968 terr[key] = (p, l, c)
|
|
969
|
|
970
|
|
971 def updateValue(distance, correction_term, omega):
|
|
972 """
|
|
973 Retrieves a distance dependend pheromone value
|
|
974 """
|
|
975 if correction_term == 0:
|
|
976 return 0
|
|
977 else:
|
|
978 if distance == 0:
|
|
979 return omega * correction_term
|
|
980 else:
|
|
981 return (1/float(distance)) * correction_term
|
|
982
|
|
983
|
|
984 def trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose):
|
|
985 """
|
|
986 Pheromone Update function accorinding to the quality of the solution
|
|
987 """
|
|
988 terr, BP = t
|
|
989 bpstack, LP = getbpStack(c_s)
|
|
990
|
|
991 struct_correction_term , GC_correction_term, seq_correction_term = correction_terms
|
|
992 omega = 2.23
|
|
993
|
|
994 bs = updateValue(ds, struct_correction_term, omega)
|
|
995 bGC = updateValue(dgc, GC_correction_term, omega)
|
|
996 if dseq != "n.a.":
|
|
997 bSeq = updateValue(dseq, seq_correction_term, omega)
|
|
998 d = bs + bGC + bSeq
|
|
999 else:
|
|
1000 d = bs + bGC
|
|
1001 transitions = getTransitions(p)
|
|
1002
|
|
1003 for trans in xrange(len(transitions)): # for each transition in the path
|
|
1004 id1 = int(transitions[trans].split(".")[0])
|
|
1005 tar_id2 = int(BPstack[id1][1][0]) # getting requested position id2
|
|
1006 curr_id2 = int(bpstack[id1]) # getting the current situation
|
|
1007 multiplicator = 0
|
|
1008 if tar_id2 == curr_id2 and id1 != tar_id2 and id1 != curr_id2: # case of a base pair, having both brackets on the correct position
|
|
1009 multiplicator = 1
|
|
1010 elif tar_id2 == curr_id2 and id1 == tar_id2 and id1 == curr_id2: # case of a single stranded base in both structures
|
|
1011 multiplicator = 1
|
|
1012 p, l, c = terr[transitions[trans]] # getting the pheromone and the length value of the single path transition
|
|
1013 p += d * multiplicator
|
|
1014 terr[transitions[trans]] = (p, l, c) # updating the values wihtin the terrain's
|
|
1015 t = (terr, BP)
|
|
1016
|
|
1017
|
|
1018 def updateTerrain(p, c_s, s, ds, dgc, dseq, dn, t, er, correction_terms, BPstack, verbose, ant_count):
|
|
1019 """
|
|
1020 General updating function
|
|
1021 """
|
|
1022 evaporate(t,er)
|
|
1023 trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose)
|
|
1024
|
|
1025
|
|
1026 def getUsedTime(start_time):
|
|
1027 """
|
|
1028 Return the used time between -start time- and now.
|
|
1029 """
|
|
1030 end_time = time.time()
|
|
1031 return end_time - start_time
|
|
1032
|
|
1033
|
|
1034 def good2Go(SC, L, CC, STR):
|
|
1035 """
|
|
1036 Check, if all input is correct and runnable
|
|
1037 """
|
|
1038 if (SC == 1 and L == 1 and CC == 1 and STR == 1):
|
|
1039 return True
|
|
1040 else:
|
|
1041 print SC,L,CC,STR
|
|
1042 return False
|
|
1043
|
|
1044
|
|
1045 def getPathFromSelection( aps, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy):
|
|
1046 """
|
|
1047 Returns the winning path from a selection of pathes...
|
|
1048 """
|
|
1049 terr, BPs = terrain
|
|
1050 win_path = 0
|
|
1051 for i in xrange(aps):
|
|
1052 # Generate Sequence
|
|
1053 path = getPath(s, terr, BPs, alpha, beta, IUPAC, IUPAC_reverseComplements)
|
|
1054 # Measure sequence features and transform them into singular distances
|
|
1055 distance_structural = float(getStructuralDistance(s, SC , path, RNAfold, verbose, LP, BPs, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy))
|
|
1056 distance_GC = float(getGCDistance(GC,getGC(path), len(path)))
|
|
1057 distance_seq = float(getSequenceEditDistance(SC, path))
|
|
1058 # Calculate Distance Score
|
|
1059 D = distance_structural + distance_GC + distance_seq
|
|
1060
|
|
1061 # SELECT THE BEST-OUT-OF-k-SOLUTIONS according to distance score
|
|
1062 if i == 0:
|
|
1063 win_path = (path, D, distance_structural, distance_GC, distance_seq)
|
|
1064 else:
|
|
1065 if D < win_path[1]:
|
|
1066 win_path = (path, D, distance_structural, distance_GC, distance_seq)
|
|
1067 return win_path
|
|
1068
|
|
1069
|
|
1070 def substr(x, string, subst):
|
|
1071 """
|
|
1072 Classical substring function
|
|
1073 """
|
|
1074 s1 = string[:x-1]
|
|
1075
|
|
1076 s2 = string[x-1:x]
|
|
1077 s3 = string[x:]
|
|
1078 #s2 = s[x+len(string)-x-1:]
|
|
1079
|
|
1080 return s1 + subst + s3
|
|
1081
|
|
1082
|
|
1083 def inConvergenceCorridor(d_struct, d_gc, BS_d_struct, BS_d_gc):
|
|
1084 """
|
|
1085 Check if a solutions qualities are within the convergence corridor
|
|
1086 """
|
|
1087 struct_var = ((BS_d_struct/float(4)) + 3 ) * 4
|
|
1088 gc_var = (BS_d_gc + 1/float(100) * 5) + BS_d_gc + 1
|
|
1089
|
|
1090 if d_struct <= struct_var and d_gc <= gc_var:
|
|
1091 return True
|
|
1092 else:
|
|
1093 return False
|
|
1094
|
|
1095 def getGCSamplingValue(GC, tGCmax, tGCvar):
|
|
1096 """
|
|
1097 Returns a suitable GC value, dependend on the user input: Either returning the single GC value,
|
|
1098 which the user entered, or a smpled GC value
|
|
1099 from a designated distribution in it's interavals
|
|
1100 """
|
|
1101 returnval = 0
|
|
1102 if tGCmax == -1.0 and tGCvar == -1.0: # regular plain tGC value as requested
|
|
1103 return GC
|
|
1104 elif tGCmax != -1.0 and tGCvar == -1.0: # uniform distribution tGC value sampling
|
|
1105 if GC < tGCmax:
|
|
1106 tmp_GC = tGCmax
|
|
1107 tGCmax = GC
|
|
1108 GC = tmp_GC
|
|
1109 while returnval <= 0:
|
|
1110 returnval = float(numpy.random.uniform(low=GC, high=tGCmax, size=1))
|
|
1111 return returnval
|
|
1112 elif tGCmax == -1.0 and tGCvar != -1.0: # normal distribution tGC value sampling
|
|
1113 while returnval <= 0:
|
|
1114 returnval = float(numpy.random.normal(GC, tGCvar, 1))
|
|
1115 return returnval
|
|
1116
|
|
1117
|
|
1118 def reachableGC(C_struct):
|
|
1119 """
|
|
1120 Checks if a demanded GC target content is reachable in dependence with the given sequence constraint.
|
|
1121 """
|
|
1122 AU = 0
|
|
1123 for i in C_struct:
|
|
1124 if i == "A" or i == "U":
|
|
1125 AU += 1
|
|
1126 maxGC = 1 - (AU / float(len(C_struct))) # 1 - min_GC
|
|
1127 return maxGC
|
|
1128
|
|
1129
|
|
1130 def runColony(s, SC, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy):
|
|
1131 """
|
|
1132 Execution function of a single ant colony finding one solution sequence
|
|
1133 """
|
|
1134 retString = ""
|
|
1135 retString2 = ""
|
|
1136 BPstack, LP = getBPStack(s, SC)
|
|
1137
|
|
1138 rGC = reachableGC(SC)
|
|
1139 GC_message = ""
|
|
1140 if GC > rGC:
|
|
1141 print >> sys.stderr, "WARNING: Chosen target GC %s content is not reachable due to sequence constraint! Sequence Constraint GC-content is: %s" % (GC, rGC)
|
|
1142 GC = rGC
|
|
1143
|
|
1144 # Initial Constraint Checks prior to execution
|
|
1145 STR = isValidStructure(s)
|
|
1146 START_SC , SC = checkSequenceConstraint(str(SC))
|
|
1147 START_LENGTH = checkSimilarLength(str(s), str(SC))
|
|
1148 START_constraint_compatibility , CompReport = checkConstaintCompatibility(BPstack, SC, IUPAC_compatibles)
|
|
1149
|
|
1150 g2g = good2Go(START_SC, START_LENGTH, START_constraint_compatibility, STR)
|
|
1151 if (g2g == 1):
|
|
1152 start_time = time.time()
|
|
1153 max_time = 600 # seconds
|
|
1154
|
|
1155
|
|
1156
|
|
1157
|
|
1158 ####
|
|
1159 # INITIALIZATION OF THE RNA TOOLs
|
|
1160 #
|
|
1161 RNAfold = init_RNAfold(temperature, paramFile)
|
|
1162 #RNAdistance = init_RNAdistance()
|
|
1163 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+')
|
|
1164 #RNAdist_pattern = re.compile('.*\s([\d]+)')
|
|
1165 #
|
|
1166 ####
|
|
1167
|
|
1168 terrain = initTerrain(s)
|
|
1169 #print len(terrain),
|
|
1170 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
|
|
1171 #print len(terrain[0])
|
|
1172 #printTerrain(terrain)
|
|
1173 #exit(0)
|
|
1174 global_ant_count = 0
|
|
1175 global_best_ants = 0
|
|
1176 criterion = False
|
|
1177 met = True
|
|
1178 ant_no = 1
|
|
1179 prev_res = 0
|
|
1180 seq = ""
|
|
1181
|
|
1182 counter = 0
|
|
1183
|
|
1184 dstruct_log = []
|
|
1185 dGC_log = []
|
|
1186
|
|
1187
|
|
1188 distance_structural = 1000
|
|
1189 distance_GC = 1000
|
|
1190 distance_seq = 1000
|
|
1191
|
|
1192 convergence = convergence_count
|
|
1193 convergence_counter = 0
|
|
1194
|
|
1195 resets = 0
|
|
1196
|
|
1197 path = ""
|
|
1198 curr_structure = ""
|
|
1199
|
|
1200 Dscore = 100000
|
|
1201 distance_structural = 10000
|
|
1202 distance_GC = 10000
|
|
1203 distance_seq = 10000
|
|
1204 best_solution = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
|
|
1205 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
|
|
1206
|
|
1207 best_solution_since = 0
|
|
1208
|
|
1209 ants_per_selection = 10
|
|
1210 if len(LP) > 0 :
|
|
1211 for lp in LP:
|
|
1212 s = substr(lp + 1, s, ".")
|
|
1213 s = substr(LP[lp] + 1, s, ".")
|
|
1214
|
|
1215 init = 1
|
|
1216 while criterion != met and getUsedTime(start_time) < max_time:
|
|
1217 iteration_start = time.time()
|
|
1218 global_ant_count += 1
|
|
1219 global_best_ants += 1
|
|
1220
|
|
1221 path_info = getPathFromSelection(ants_per_selection, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy)
|
|
1222
|
|
1223 distance_structural_prev = distance_structural
|
|
1224 distance_GC_prev = distance_GC
|
|
1225 distance_seq_prev = distance_seq
|
|
1226
|
|
1227 path, Dscore , distance_structural, distance_GC, distance_seq = path_info
|
|
1228 curr_structure = ""
|
|
1229 if pseudoknots:
|
|
1230 curr_structure = getPKStructure(path, strategy)
|
|
1231 else:
|
|
1232 curr_structure = getRNAfoldStructure(path, RNAfold)
|
|
1233
|
|
1234 curr_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
|
|
1235 # BEST SOLUTION PICKING
|
|
1236 if improve == "h": # hierarchical check
|
|
1237 # for the global best solution
|
|
1238 if distance_structural < best_solution[3] or (distance_structural == best_solution[3] and distance_GC < best_solution[4]):
|
|
1239 best_solution = curr_solution
|
|
1240 ant_no = 1
|
|
1241 # for the local (reset) best solution
|
|
1242 if distance_structural < best_solution_local[3] or (distance_structural == best_solution_local[3] and distance_GC < best_solution_local[4]):
|
|
1243 best_solution_local = curr_solution
|
|
1244
|
|
1245 elif improve == "s": #score based check
|
|
1246 # store best global solution
|
|
1247 if Dscore < best_solution[2]:
|
|
1248 best_solution = curr_solution
|
|
1249 ant_no = 1
|
|
1250 # store best local solution for this reset
|
|
1251 if Dscore < best_solution_local[2]:
|
|
1252 best_solution_local = curr_solution
|
|
1253
|
|
1254 # OLD ' BEST SOLUTION ' PICKING
|
|
1255 # if Dscore < best_solution[2]:
|
|
1256 # best_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
|
|
1257 #
|
|
1258 # if Dscore < best_solution_local[2]:
|
|
1259 # best_solution_local = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
|
|
1260
|
|
1261
|
|
1262 distance_DN = 0
|
|
1263
|
|
1264 if verbose:
|
|
1265 print "SCORE " + str(Dscore) + " Resets " + str(resets) + " #Ant " + str(global_ant_count) + " out of " + str(ants_per_selection) + " cc " + str(convergence_counter)
|
|
1266
|
|
1267 print s, " <- target struct"
|
|
1268 print best_solution[0] , " <- BS since ", str(best_solution_since), "Size of Terrrain:", len(terrain[0])
|
|
1269 print best_solution[1] , " <- BS Dscore " + str(best_solution[2]) + " ds " + str(best_solution[3]) + " dGC " + str(best_solution[4]) + " dseq " + str(best_solution[5])+ " LP " + str(len(LP)) + " <- best solution stats"
|
|
1270 print curr_structure, " <- CS"
|
|
1271 print path,
|
|
1272 print " <- CS", "Dscore", str(Dscore), "ds", distance_structural, "dGC", distance_GC, "GC", getGC(path)*100, "Dseq", distance_seq
|
|
1273
|
|
1274 #### UPDATING THE TERRAIN ACCORDING TO THE QUALITY OF THE CURRENT BESTO-OUT-OF-k SOLUTION
|
|
1275 updateTerrain(path, curr_structure, s, distance_structural,distance_GC, distance_seq, distance_DN, terrain, evaporation_rate, correction_terms, BPstack, verbose, global_ant_count)
|
|
1276 ####
|
|
1277 if verbose: print "Used time for one iteration", time.time() - iteration_start
|
|
1278
|
|
1279
|
|
1280 # CONVERGENCE AND TERMINATION CRITERION MANAGEMENT
|
|
1281 #print distance_structural, distance_GC, best_solution_local[3], best_solution_local[4]
|
|
1282 if inConvergenceCorridor(curr_solution[3], curr_solution[4], best_solution_local[3], best_solution_local[4]):
|
|
1283 convergence_counter += 1
|
|
1284 if distance_structural_prev == distance_structural and distance_GC_prev == distance_GC:
|
|
1285 convergence_counter += 1
|
|
1286
|
|
1287 if best_solution[3] == objective_to_target_distance:
|
|
1288 if best_solution[4] == 0.0:
|
|
1289 break
|
|
1290 ant_no = ant_no + 1
|
|
1291 convergence_counter -= 1
|
|
1292 else:
|
|
1293 ant_no = 1
|
|
1294
|
|
1295
|
|
1296 if ant_no == termination_convergence or resets >= reset_limit or global_ant_count >= 100000 or best_solution_since == 5:
|
|
1297 break
|
|
1298
|
|
1299 # RESET
|
|
1300 if ant_no < termination_convergence and convergence_counter >= convergence:
|
|
1301
|
|
1302 terrain = initTerrain(s)
|
|
1303 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
|
|
1304 criterion = False
|
|
1305 met = True
|
|
1306 ant_no = 1
|
|
1307 prev_res = 0
|
|
1308 pre_path = "_" * len(s)
|
|
1309 path = ""
|
|
1310 curr_structure = ""
|
|
1311 counter = 0
|
|
1312 Dscore = 100000
|
|
1313 distance_structural = 1000
|
|
1314 distance_GC = 1000
|
|
1315 distance_seq = 1000
|
|
1316 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
|
|
1317 convergence = convergence_count
|
|
1318 convergence_counter = 0
|
|
1319
|
|
1320 if resets == 0:
|
|
1321 sentinel_solution = best_solution
|
|
1322 best_solution_since += 1
|
|
1323 else:
|
|
1324 if best_solution[2] < sentinel_solution[2]:
|
|
1325 sentinel_solution = best_solution
|
|
1326 best_solution_since = 0
|
|
1327 else:
|
|
1328 best_solution_since += 1
|
|
1329
|
|
1330 resets += 1
|
|
1331
|
|
1332 duration = getUsedTime(start_time)
|
|
1333
|
|
1334 retString += "|Ants:" + str(global_ant_count)
|
|
1335 retString += "|Resets:" + str(resets) + "/" + str(reset_limit)
|
|
1336 retString += "|AntsTC:" + str(termination_convergence)
|
|
1337 retString += "|CC:" + str(convergence_count)
|
|
1338 retString += "|IP:" + str(improve)
|
|
1339 retString += "|BSS:" + str(best_solution_since)
|
|
1340 #if GC_message != "":
|
|
1341 # retString += GC_message + "\n"
|
|
1342
|
|
1343 sequence = best_solution[0]
|
|
1344 struct = best_solution[1]
|
|
1345
|
|
1346 retString += "|LP:" + str(len(LP))
|
|
1347 retString += "|ds:" + str(getStructuralDistance(s,SC, sequence, RNAfold, verbose, LP, BPstack, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy))
|
|
1348 retString += "|dGC:" + str(best_solution[4])
|
|
1349 retString += "|GC:" + str(getGC(sequence)*100)
|
|
1350 retString += "|dseq:" + str(getSequenceEditDistance(SC, sequence))
|
|
1351 retString += "|L:" + str(len(sequence))
|
|
1352 retString += "|Time:" + str(duration)
|
|
1353 retString2 += "\n" + struct + "\n"
|
|
1354 retString2 += sequence
|
|
1355
|
|
1356 # CLOSING THE PIPES TO THE PROGRAMS
|
|
1357 RNAfold.communicate()
|
|
1358 #RNAdistance.communicate()
|
|
1359
|
|
1360 else: # Structural premisses are not met, htherefore the program will halt with a failure message
|
|
1361 retString += "\nSome mistake detected\n"
|
|
1362 retString += "SequenceConstraintCheck: " + str(START_SC) + "\nSequenceConstraint: " + str(SC) + "\nLengthCheck: " + str(START_LENGTH) + "\nConstraintCompatibility: " + str(START_constraint_compatibility)+ "\n" + CompReport + "\n"
|
|
1363
|
|
1364 return (retString, retString2)
|
|
1365
|
|
1366 def findSequence(structure, Cseq, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU, return_mod = False):
|
|
1367 """
|
|
1368 MAIN antaRNA - ant assembled RNA
|
|
1369 """
|
|
1370
|
|
1371 if seed != "none":
|
|
1372 random.seed(seed)
|
|
1373
|
|
1374 if Cseq == "":
|
|
1375 sequenceconstraint = "N" * len(structure)
|
|
1376 else:
|
|
1377 sequenceconstraint = str(Cseq)
|
|
1378
|
|
1379 alpha = float(alpha)
|
|
1380 beta = float(beta)
|
|
1381 tGC = float(tGC)
|
|
1382 evaporation_rate = float(evaporation_rate)
|
|
1383 struct_correction_term = float(struct_correction_term)
|
|
1384 GC_correction_term = float(GC_correction_term)
|
|
1385 seq_correction_term = float(seq_correction_term)
|
|
1386 colonies = int(colonies)
|
|
1387 file_id = str(file_id)
|
|
1388 verbose = verbose
|
|
1389 output_verbose = output_verbose
|
|
1390 correction_terms = struct_correction_term, GC_correction_term, seq_correction_term
|
|
1391 temperature = float(temperature)
|
|
1392 print_to_STDOUT = (file_id == "STDOUT")
|
|
1393
|
|
1394 useGU = useGU
|
|
1395
|
|
1396 if return_mod == False:
|
|
1397 if print_to_STDOUT == False:
|
|
1398 outfolder = '/'.join(file_id.strip().split("/")[:-1])
|
|
1399 curr_dir = os.getcwd()
|
|
1400 if not os.path.exists(outfolder):
|
|
1401 os.makedirs(outfolder)
|
|
1402 os.chdir(outfolder)
|
|
1403
|
|
1404
|
|
1405 sequenceconstraint = transform(sequenceconstraint)
|
|
1406 ###############################################
|
|
1407
|
|
1408 # Allowed deviation from the structural target:
|
|
1409 objective_to_target_distance = 0.0
|
|
1410
|
|
1411 # Loading the IUPAC copatibilities of nuleotides and their abstract representing symbols
|
|
1412 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"}
|
|
1413 IUPAC_compatibles = loadIUPACcompatibilities(IUPAC, useGU)
|
|
1414
|
|
1415 IUPAC_reverseComplements = {}
|
|
1416 if useGU == False: ## Without the GU basepair
|
|
1417 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"C", "U":"A", "R":"UC", "Y":"AG", "S":"GC", "W":"UA","K":"CA", "M":"UG", "B":"AGC", "D":"ACU", "H":"UGA", "V":"UGC", "N":"ACGU"}
|
|
1418 else: ## allowing the GU basepair
|
|
1419 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"UC", "U":"AG", "R":"UC", "Y":"AG", "S":"UGC", "W":"UAG","K":"UCAG", "M":"UG", "B":"AGCU", "D":"AGCU", "H":"UGA", "V":"UGC", "N":"ACGU"}
|
|
1420
|
|
1421 result = []
|
|
1422 for col in xrange(colonies):
|
|
1423 # Checking the kind of taget GC value should be used
|
|
1424 GC = getGCSamplingValue(tGC, tGCmax, tGCvar)
|
|
1425
|
|
1426 # Actual execution of a ant colony procesdure
|
|
1427 output_v, output_w = runColony(structure, sequenceconstraint, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy)
|
|
1428
|
|
1429 # Post-Processing the output of a ant colony procedure
|
|
1430 line = ">" + name + str(col)
|
|
1431 if output_verbose:
|
|
1432 line += "|Cstr:" + structure + "|Cseq:" + sequenceconstraint + "|Alpha:" + str(alpha) + "|Beta:" + str(beta) + "|tGC:" + str(GC) + "|ER:" + str(evaporation_rate) + "|Struct_CT:" + str(struct_correction_term) + "|GC_CT:" + str(GC_correction_term) + "|Seq_CT:" + str(seq_correction_term) + output_v + output_w
|
|
1433 else:
|
|
1434 line += output_w
|
|
1435 if return_mod == False:
|
|
1436 if print_to_STDOUT:
|
|
1437 print line
|
|
1438 else:
|
|
1439 if col == 0:
|
|
1440 print2file(file_id, line, 'w')
|
|
1441 else:
|
|
1442 print2file(file_id, line, 'a')
|
|
1443 else:
|
|
1444 result.append(line)
|
|
1445
|
|
1446 if return_mod == True:
|
|
1447 return result
|
|
1448 if print_to_STDOUT == False:
|
|
1449 os.chdir(curr_dir)
|
|
1450
|
|
1451 def execute(args):
|
|
1452 """
|
|
1453 CHECK AND PARSE THE COMMAND LINE STUFF
|
|
1454 """
|
|
1455
|
|
1456 # Checking the arguments, parsed from the shell
|
|
1457 ###############################################
|
|
1458 name = args.name
|
|
1459 structure = args.Cstr
|
|
1460
|
|
1461 if args.Cseq == "":
|
|
1462 sequenceconstraint = "N" * len(structure)
|
|
1463 else:
|
|
1464 sequenceconstraint = args.Cseq
|
|
1465
|
|
1466 seed = args.seed
|
|
1467
|
|
1468
|
|
1469 alpha = args.alpha
|
|
1470 beta = args.beta
|
|
1471 tGC = args.tGC
|
|
1472 evaporation_rate = args.ER
|
|
1473 struct_correction_term = args.Cstrweight
|
|
1474 GC_correction_term = args.Cgcweight
|
|
1475 seq_correction_term = args.Cseqweight
|
|
1476 colonies = args.noOfColonies
|
|
1477 degreeOfSequenceInducement = args.level
|
|
1478 file_id = args.output_file
|
|
1479 verbose = args.verbose
|
|
1480 output_verbose = args.output_verbose
|
|
1481
|
|
1482 tGCmax = args.tGCmax
|
|
1483 tGCvar = args.tGCvar
|
|
1484
|
|
1485 termination_convergence = args.antsTerConv
|
|
1486 convergence_count = args.ConvergenceCount
|
|
1487 temperature = args.temperature
|
|
1488 reset_limit = args.Resets
|
|
1489
|
|
1490 improve = args.improve_procedure
|
|
1491
|
|
1492 ### RNAfold parameterfile
|
|
1493 paramFile = args.paramFile
|
|
1494
|
|
1495 # Using the pkiss program under user changeable parameters
|
|
1496 pseudoknots = args.pseudoknots
|
|
1497
|
|
1498 # Loading the optimized parameters for pseudoknots and ignore user input
|
|
1499 if args.pseudoknot_parameters:
|
|
1500 alpha = 1.0
|
|
1501 beta = 0.1
|
|
1502 evaporation_rate = 0.2
|
|
1503 struct_correction_term = 0.1
|
|
1504 GC_correction_term = 1.0
|
|
1505 seq_correction_term = 0.5
|
|
1506 termination_convergence = 50
|
|
1507 convergence_count = 100
|
|
1508
|
|
1509
|
|
1510 strategy = args.strategy
|
|
1511 useGU = args.useGUBasePair
|
|
1512
|
|
1513 checkForViennaTools()
|
|
1514 if pseudoknots:
|
|
1515 checkForpKiss()
|
|
1516 findSequence(structure, sequenceconstraint, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU)
|
|
1517
|
|
1518
|
|
1519 def exe():
|
|
1520 """
|
|
1521 MAIN EXECUTABLE WHICH PARSES THE INPUT LINE
|
|
1522 """
|
|
1523
|
|
1524 argument_parser = argparse.ArgumentParser(
|
|
1525 description = """
|
|
1526 Ant Colony Optimized RNA Sequence Design
|
|
1527 """,
|
|
1528
|
|
1529 epilog = """
|
|
1530
|
|
1531 #########################################################################
|
|
1532 # antaRNA - ant assembled RNA #
|
|
1533 # -> Ant Colony Optimized RNA Sequence Design #
|
|
1534 # ------------------------------------------------------------ #
|
|
1535 # Robert Kleinkauf (c) 2015 #
|
|
1536 # Bioinformatics, Albert-Ludwigs University Freiburg, Germany #
|
|
1537 #########################################################################
|
|
1538
|
|
1539 - For antaRNA only the VIENNNA RNA Package must be installed on your linux system.
|
|
1540 antaRNA will only check, if the executables of RNAfold and RNAdistance of the ViennaRNA package can be found. If those programs are
|
|
1541 not installed correctly, no output will be generated, an also no warning will be prompted.
|
|
1542 So the binary path of the Vienna Tools must be set up correctly in your system's PATH variable in order to run antaRNA correctly!
|
|
1543
|
|
1544 - antaRNA was only tested under Linux.
|
|
1545
|
|
1546 - For questions and remarks please feel free to contact us at http://www.bioinf.uni-freiburg.de/
|
|
1547
|
|
1548 Example calls:
|
|
1549 python antaRNA.py --Cstr "...(((...)))..." --tGC 0.5 -n 2
|
|
1550 python antaRNA.py --Cstr ".........AAA(((...)))AAA........." --tGC 0.5 -n 10 --output_file /path/to/antaRNA_TESTRUN -ov
|
|
1551 python antaRNA.py --Cstr "BBBBB....AAA(((...)))AAA....BBBBB" --Cseq "NNNNANNNNNCNNNNNNNNNNNGNNNNNNUNNN" --tGC 0.5 -n 10
|
|
1552
|
|
1553 #########################################################################
|
|
1554 # --- Hail to the Queen!!! All power to the swarm!!! --- #
|
|
1555 #########################################################################
|
|
1556 """,
|
|
1557 #formatter_class=RawTextHelpFormatter
|
|
1558 )
|
|
1559
|
|
1560 # mandatorys
|
|
1561 argument_parser.add_argument("-Cstr", "--Cstr", help="Structure constraint using RNA dotbracket notation with fuzzy block constraint. \n(TYPE: %(type)s)\n\n", type=str, required=True)
|
|
1562 argument_parser.add_argument("-tGC", "--tGC", help="Objective target GC content in [0,1].\n(TYPE: %(type)s)\n\n", type=float, required=True)
|
|
1563 argument_parser.add_argument("-n", "--noOfColonies", help="Number of sequences which shall be produced. \n(TYPE: %(type)s)\n\n\n\n", type=int, required=True)
|
|
1564 argument_parser.add_argument("-GU", "--useGUBasePair", help="Allowing GU base pairs. \n\n", action="store_true")
|
|
1565
|
|
1566 argument_parser.add_argument("-s", "--seed", help = "Provides a seed value for the used pseudo random number generator.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="none")
|
|
1567 argument_parser.add_argument("-ip", "--improve_procedure", help = "Select the improving method. h=hierarchical, s=score_based.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="s")
|
|
1568 argument_parser.add_argument("-r", "--Resets", help = "Amount of maximal terrain resets, until the best solution is retuned as solution.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=5)
|
|
1569 argument_parser.add_argument("-CC", "--ConvergenceCount", help = "Delimits the convergence count criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=130)
|
|
1570 argument_parser.add_argument("-aTC", "--antsTerConv", help = "Delimits the amount of internal ants for termination convergence criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=50)
|
|
1571
|
|
1572 argument_parser.add_argument("-p", "--pseudoknots", help = "Switch to pseudoknot based prediction using pKiss. Check the pseudoknot parameter usage!!!\n\n", action="store_true")
|
|
1573 argument_parser.add_argument("-pkPar", "--pseudoknot_parameters", help = "Enable optimized parameters for the usage of pseudo knots (Further parameter input ignored).\n\n", action="store_true")
|
|
1574 argument_parser.add_argument("--strategy", help = "Defining the pKiss folding strategy.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="A")
|
|
1575
|
|
1576 # Mutual Exclusiv target GC distribution variables
|
|
1577 #tGCgroup = argument_parser.add_mutually_exclusive_group()
|
|
1578 argument_parser.add_argument("-tGCmax", "--tGCmax", help = "Provides a maximum tGC value [0,1] for the case of uniform distribution sampling. The regular tGC value serves as minimum value.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0)
|
|
1579 argument_parser.add_argument("-tGCvar", "--tGCvar", help = "Provides a tGC variance (sigma square) for the case of normal distribution sampling. The regular tGC value serves as expectation value (mu).\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0)
|
|
1580
|
|
1581 argument_parser.add_argument("-t", "--temperature", help = "Provides a temperature for the folding algorithms.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=37.0)
|
|
1582 argument_parser.add_argument("-P", "--paramFile", help = "Changes the energy parameterfile of RNAfold. If using this explicitly, please provide a suitable energy file delivered by RNAfold. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="")
|
|
1583 argument_parser.add_argument("-of","--output_file", help="Provide a path and an output file, e.g. \"/path/to/the/target_file\". \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="STDOUT")
|
|
1584 argument_parser.add_argument("-Cseq", "--Cseq", help="Sequence constraint using RNA nucleotide alphabet {A,C,G,U} and wild-card \"N\". \n(TYPE: %(type)s)\n\n", type=str, default = "")
|
|
1585 argument_parser.add_argument("-l", "--level", help="Sets the level of allowed influence of sequence constraint on the structure constraint [0:no influence; 3:extensive influence].\n(TYPE: %(type)s)\n\n", type=int, default = 1)
|
|
1586 argument_parser.add_argument("--name", help="Defines a name which is used in the sequence output. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="antaRNA_")
|
|
1587 argument_parser.add_argument("-a", "--alpha", help="Sets alpha, probability weight for terrain path influence. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0)
|
|
1588 argument_parser.add_argument("-b", "--beta", help="Sets beta, probability weight for terrain pheromone influence. [0,1] \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0)
|
|
1589 argument_parser.add_argument("-er", "--ER", help="Pheromone evaporation rate. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.2)
|
|
1590 argument_parser.add_argument("-Cstrw", "--Cstrweight", help="Structure constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.5)
|
|
1591 argument_parser.add_argument("-Cgcw", "--Cgcweight", help="GC content constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=5.0)
|
|
1592 argument_parser.add_argument("-Cseqw", "--Cseqweight", help="Sequence constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n\n", type=float, default=1.0)
|
|
1593 argument_parser.add_argument("-v", "--verbose", help="Displayes intermediate output.\n\n", action="store_true")
|
|
1594 argument_parser.add_argument("-ov", "--output_verbose", help="Prints additional output to the headers of the produced sequences.\n\n", action="store_false")
|
|
1595
|
|
1596 args = argument_parser.parse_args()
|
|
1597
|
|
1598 execute(args)
|
|
1599
|
|
1600 def checkForViennaTools():
|
|
1601 """
|
|
1602 Checking for the presence of the Vienna tools in the system by which'ing for RNAfold and RNAdistance
|
|
1603 """
|
|
1604 RNAfold_output = subprocess.Popen(["which", "RNAfold"], stdout=subprocess.PIPE).communicate()[0].strip()
|
|
1605 if len(RNAfold_output) > 0 and RNAfold_output.find("found") == -1 and RNAfold_output.find(" no ") == -1:
|
|
1606 return True
|
|
1607 else:
|
|
1608 print "It seems the Vienna RNA Package is not installed on your machine. Please do so!"
|
|
1609 print "You can get it at http://www.tbi.univie.ac.at/"
|
|
1610 exit(0)
|
|
1611
|
|
1612
|
|
1613 def checkForpKiss():
|
|
1614 """
|
|
1615 Checking for the presence of pKiss
|
|
1616 """
|
|
1617 pKiss_output = subprocess.Popen(["which", "pKiss"], stdout=subprocess.PIPE).communicate()[0].strip()
|
|
1618 if len(pKiss_output) > 0 and pKiss_output.find("found") == -1 and pKiss_output.find(" no ") == -1:
|
|
1619 return True
|
|
1620 else:
|
|
1621 print "It seems that pKiss is not installed on your machine. Please do so!"
|
|
1622 print "You can get it at http://bibiserv2.cebitec.uni-bielefeld.de/pkiss"
|
|
1623 exit(0)
|
|
1624
|
|
1625
|
|
1626
|
|
1627 if __name__ == "__main__":
|
|
1628
|
|
1629 exe()
|
|
1630
|