comparison smRtools.py @ 2:9f17e8fc1d28 draft

Uploaded
author drosofff
date Sun, 22 Jun 2014 18:31:38 -0400
parents
children 59dfb33ca70e
comparison 1:f6c22925fc3c 2:9f17e8fc1d28
#!/usr/bin/python
# version 1 7-5-2012 unification of the SmRNAwindow class

import sys, subprocess
import random # needed by get_file_sample
from collections import defaultdict, OrderedDict
from numpy import mean, median, std
from scipy import stats

def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
    '''Return a dictionary with fasta identifiers as keys and sequences as values.
    index must be the path (basename) of a bowtie index.'''
    p = subprocess.Popen(args=["bowtie-inspect", "-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # with "-a 0", bowtie-inspect outputs each sequence on a single line
    outputlines = p.stdout.readlines()
    p.wait()
    item_dic = {}
    for line in outputlines:
        if (line[0] == ">"):
            try:
                item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item
            except NameError:
                pass # first item: current_item is not defined yet
            current_item = line[1:].rstrip().split()[0] # take the first word before any space, because bowtie splits headers
            item_dic[current_item] = ""
            stringlist = []
        else:
            stringlist.append(line.rstrip())
    item_dic[current_item] = "".join(stringlist) # for the last item
    return item_dic

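# Illustrative sketch (not part of the original tool): get_fasta() is typically called
# with the basename of a bowtie index; the path and the "2L" identifier below are hypothetical.
#
#   chrom_sequences = get_fasta("/path/to/bowtie_index/dmel")
#   print len(chrom_sequences), "sequences loaded"
#   print chrom_sequences["2L"][:50]  # first 50 nt of that reference, if present
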
def get_fasta_headers (index):
    '''Return a dictionary whose keys are the fasta identifiers found in a bowtie index.'''
    p = subprocess.Popen(args=["bowtie-inspect", "-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # with "-n", bowtie-inspect outputs only the reference names
    outputlines = p.stdout.readlines()
    p.wait()
    item_dic = {}
    for line in outputlines:
        header = line.rstrip().split()[0] # take the first word before any space, because bowtie splits headers
        item_dic[header] = 1
    return item_dic


def get_file_sample (file, numberoflines):
    '''Return a random sample of numberoflines lines from file (random is imported at module level).'''
    F = open(file)
    fullfile = F.read().splitlines()
    F.close()
    if len(fullfile) < numberoflines:
        return "sample size exceeds file size"
    return random.sample(fullfile, numberoflines)

def get_fasta_from_history (file):
    '''Return a dictionary with fasta identifiers as keys and sequences as values, parsed from a fasta file.'''
    F = open (file, "r")
    item_dic = {}
    for line in F:
        if (line[0] == ">"):
            try:
                item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item
            except NameError:
                pass # first item: current_item is not defined yet
            current_item = line[1:-1].split()[0] # take the first word before any space, because bowtie splits headers
            item_dic[current_item] = ""
            stringlist = []
        else:
            stringlist.append(line[:-1])
    item_dic[current_item] = "".join(stringlist) # for the last item
    F.close()
    return item_dic

def antipara (sequence):
    '''Return the reverse complement of a DNA sequence.'''
    antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
    revseq = sequence[::-1]
    return "".join([antidict[i] for i in revseq])

def RNAtranslate (sequence):
    '''Return the sequence in the RNA alphabet (T or U rewritten as U).'''
    return "".join([i if i in "AGCN" else "U" for i in sequence])

def DNAtranslate (sequence):
    '''Return the sequence in the DNA alphabet (U or T rewritten as T).'''
    return "".join([i if i in "AGCN" else "T" for i in sequence])

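# Illustrative sketch (not part of the original tool): reverse-complement a short
# sequence and convert it to the RNA alphabet.
#
#   >>> antipara("ATGCN")
#   'NGCAT'
#   >>> RNAtranslate(antipara("ATGCN"))
#   'NGCAU'
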
def RNAfold (sequence_list):
    '''Fold each sequence with RNAfold and return a dictionary {DNA sequence: minimum free energy}.'''
    thestring = "\n".join(sequence_list)
    p = subprocess.Popen(args=["RNAfold", "--noPS"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = p.communicate(thestring)[0]
    p.wait()
    output = output.split("\n")
    if not output[-1]: output = output[:-1] # remove the trailing empty line
    buffer = []
    for line in output:
        if line[0] in ["N", "A", "T", "U", "G", "C"]:
            buffer.append(DNAtranslate(line))
        if line[0] in ["(", ".", ")"]:
            fields = line.split("(")
            energy = fields[-1]
            energy = energy[:-1] # remove the closing parenthesis
            energy = float(energy)
            buffer.append(str(energy))
    return dict(zip(buffer[::2], buffer[1::2]))

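# Illustrative sketch (not part of the original tool): RNAfold() requires the ViennaRNA
# "RNAfold" executable on the PATH; the hairpin below is only an example.
#
#   energies = RNAfold(["GGGGAAAACCCC"])
#   print energies  # e.g. {'GGGGAAAACCCC': -3.0}; the exact value depends on the RNAfold version
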
def extractsubinstance (start, end, instance):
    '''Testing whether this can be a function external to the class, to save memory.
    Returns a new SmRNAwindow covering instance.sequence[start-1:end] with the reads that fall in that window (1-based coordinates).'''
    subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start)
    subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
    upcoordinate = [i for i in range(start, end+1) if instance.readDict[i]]
    downcoordinate = [-i for i in range(start, end+1) if instance.readDict[-i]]
    for i in upcoordinate:
        subinstance.readDict[i] = instance.readDict[i]
    for i in downcoordinate:
        subinstance.readDict[i] = instance.readDict[i]
    return subinstance

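# Illustrative sketch (not part of the original tool): extract positions 1000-2000 of an
# existing SmRNAwindow instance (here a hypothetical "window") as a new, smaller instance.
#
#   subwindow = extractsubinstance(1000, 2000, window)
#   print subwindow.gene, subwindow.size
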
class HandleSmRNAwindows:
    def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined"):
        self.biosample = biosample
        self.alignmentFile = alignmentFile
        self.alignmentFileFormat = alignmentFileFormat # can be "tabular", "sam" or "bam"
        self.genomeRefFile = genomeRefFile
        self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource"
        self.alignedReads = 0
        self.instanceDict = {}
        if genomeRefFormat == "bowtieIndex":
            self.itemDict = get_fasta (genomeRefFile)
        elif genomeRefFormat == "fastaSource":
            self.itemDict = get_fasta_from_history (genomeRefFile)
        for item in self.itemDict:
            self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample) # create one SmRNAwindow instance per reference item
        self.readfile()

    def readfile (self):
        if self.alignmentFileFormat == "tabular":
            F = open (self.alignmentFile, "r")
            for line in F:
                fields = line.split()
                polarity = fields[1]
                gene = fields[2]
                offset = int(fields[3])
                size = len (fields[4])
                self.instanceDict[gene].addread (polarity, offset+1, size) # convert to the 1-based coordinates used by SmRNAwindow
                self.alignedReads += 1
            F.close()
        elif self.alignmentFileFormat == "sam":
            F = open (self.alignmentFile, "r")
            sam_polarity = {"0": "+", "16": "-"}
            for line in F:
                if line[0] == '@':
                    continue # skip header lines
                fields = line.split()
                if fields[2] == "*": continue # skip unmapped reads
                polarity = sam_polarity[fields[1]]
                gene = fields[2]
                offset = int(fields[3])
                size = len (fields[9])
                self.instanceDict[gene].addread (polarity, offset, size) # sam coordinates are already 1-based
                self.alignedReads += 1
            F.close()
        elif self.alignmentFileFormat == "bam":
            import pysam
            samfile = pysam.Samfile(self.alignmentFile)
            for read in samfile:
                if read.tid == -1:
                    continue # filter out unaligned reads
                if read.is_reverse:
                    polarity = "-"
                else:
                    polarity = "+"
                gene = samfile.getrname(read.tid)
                offset = read.pos
                size = read.qlen
                self.instanceDict[gene].addread (polarity, offset+1, size) # pysam reports 0-based coordinates (https://media.readthedocs.org/pdf/pysam/latest/pysam.pdf)
                self.alignedReads += 1
        return

    def CountFeatures (self, GFF3="path/to/file"):
        '''Count aligned reads per feature type of a GFF3 file; note that the "destructive" readcount call empties self.readDict as it goes.'''
        featureDict = defaultdict(int)
        F = open (GFF3, "r")
        for line in F:
            if line[0] == "#": continue
            fields = line[:-1].split()
            chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
            featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
        F.close()
        return featureDict

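# Illustrative sketch (not part of the original tool): typical driver code; the file names,
# the bowtie index path and the "2L" reference name below are hypothetical.
#
#   handler = HandleSmRNAwindows(alignmentFile="aligned_reads.tabular",
#                                alignmentFileFormat="tabular",
#                                genomeRefFile="/path/to/bowtie_index/dmel",
#                                genomeRefFormat="bowtieIndex",
#                                biosample="ovary_rep1")
#   print handler.alignedReads
#   window = handler.instanceDict["2L"]  # one SmRNAwindow per reference item
#   feature_counts = handler.CountFeatures(GFF3="dmel_annotations.gff3")
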
class SmRNAwindow:

    def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined"):
        self.biosample = biosample
        self.sequence = sequence
        self.gene = gene
        self.windowoffset = windowoffset
        self.size = len(sequence)
        self.readDict = defaultdict(list) # {+/-offset: [size1, size2, ...], ...}
        self.matchedreadsUp = 0
        self.matchedreadsDown = 0

    def addread (self, polarity, offset, size):
        '''Add a read of the given size at a 1-based offset.
        ATTENTION: the conversion from 0- to 1-based offsets was removed here; it is now done during read parsing.
        Reverse reads are stored under the negative coordinate of their 5' end.'''
        if polarity == "+":
            self.readDict[offset].append(size)
            self.matchedreadsUp += 1
        else:
            self.readDict[-(offset + size - 1)].append(size)
            self.matchedreadsDown += 1
        return

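    # Illustrative sketch (not part of the original tool): a 23 nt reverse read whose
    # leftmost genomic coordinate is 100 is stored under key -(100 + 23 - 1) = -122,
    # i.e. the coordinate of its 5' end on the reverse strand.
    #
    #   window = SmRNAwindow("example_gene", sequence="A" * 200)
    #   window.addread("-", 100, 23)
    #   print window.readDict[-122]  # [23]
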
    def barycenter (self, upstream_coord=None, downstream_coord=None):
        '''Refactored 24-12-2013 to save memory and introduce offset filtering (see the readcount method for discussion).
        In this version the dictionary structure is replaced by a list of tuples to save memory.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        window_size = downstream_coord - upstream_coord + 1
        def weigthAverage (TuppleList):
            weightSum = 0
            PonderWeightSum = 0
            for tuple in TuppleList:
                PonderWeightSum += tuple[0] * tuple[1]
                weightSum += tuple[1]
            if weightSum > 0:
                return PonderWeightSum / float(weightSum)
            else:
                return 0
        forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # forward reads within the offset window
        reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # reverse reads within the offset window
        Fbarycenter = (weigthAverage (forwardTuppleList) - upstream_coord) / window_size
        Rbarycenter = (weigthAverage (reverseTuppleList) - upstream_coord) / window_size
        return Fbarycenter, Rbarycenter

    def correlation_mapper (self, reference, window_size):
        '''Map the Spearman correlation with a reference profile using a sliding window (26-2-2013).'''
        if window_size > self.size:
            return []
        F = open(reference, "r")
        reference_forward = []
        reference_reverse = []
        for line in F:
            fields = line.split()
            reference_forward.append(int(float(fields[1])))
            reference_reverse.append(int(float(fields[2])))
        F.close()
        local_object_forward = []
        local_object_reverse = []
        ## dict to list for the local object
        for i in range(1, self.size+1):
            local_object_forward.append(len(self.readDict[i]))
            local_object_reverse.append(len(self.readDict[-i]))
        ## compile results window by window
        results = []
        for coordinate in range(self.size - window_size):
            local_forward = local_object_forward[coordinate:coordinate + window_size]
            local_reverse = local_object_reverse[coordinate:coordinate + window_size]
            if sum(local_forward) == 0 or sum(local_reverse) == 0:
                continue
            try:
                reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward)
                reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
                if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0] > 0.2):
                    results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]])
            except:
                pass
        return results

    def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"):
        '''Refactored 24-12-2013 to save memory and introduce offset filtering.
        Default coordinates cannot be defined relative to the instance (they are evaluated before instantiation), so the trick is to pass None and resolve them here.
        The polarity parameter can take "both", "forward" or "reverse" as value.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        if size_inf == 0 and size_sup == 1000: # shortcuts only apply when no size filtering is requested
            if upstream_coord == 1 and downstream_coord == self.windowoffset + self.size - 1 and polarity == "both":
                return self.matchedreadsUp + self.matchedreadsDown
            if upstream_coord == 1 and downstream_coord == self.windowoffset + self.size - 1 and polarity == "forward":
                return self.matchedreadsUp
            if upstream_coord == 1 and downstream_coord == self.windowoffset + self.size - 1 and polarity == "reverse":
                return self.matchedreadsDown
        n = 0
        if polarity == "both":
            for offset in xrange(upstream_coord, downstream_coord+1):
                if offset in self.readDict:
                    for read in self.readDict[offset]:
                        if (read >= size_inf and read <= size_sup):
                            n += 1
                    if method != "conservative":
                        del self.readDict[offset] ## Careful: precludes re-use of the self.readDict dictionary
                if -offset in self.readDict:
                    for read in self.readDict[-offset]:
                        if (read >= size_inf and read <= size_sup):
                            n += 1
                    if method != "conservative":
                        del self.readDict[-offset]
            return n
        elif polarity == "forward":
            for offset in xrange(upstream_coord, downstream_coord+1):
                if offset in self.readDict:
                    for read in self.readDict[offset]:
                        if (read >= size_inf and read <= size_sup):
                            n += 1
            return n
        elif polarity == "reverse":
            for offset in xrange(upstream_coord, downstream_coord+1):
                if -offset in self.readDict:
                    for read in self.readDict[-offset]:
                        if (read >= size_inf and read <= size_sup):
                            n += 1
            return n

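    # Illustrative sketch (not part of the original tool): count 19-23 nt reads on the
    # forward strand of a sub-window of a SmRNAwindow instance "window"; coordinates are
    # 1-based and inclusive, and the 500/1500 bounds are hypothetical.
    #
    #   n = window.readcount(size_inf=19, size_sup=23,
    #                        upstream_coord=500, downstream_coord=1500,
    #                        polarity="forward")
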
    def readsizes (self):
        '''return a dictionary of number of reads by size (the keys)'''
        dicsize = {}
        for offset in self.readDict:
            for size in self.readDict[offset]:
                dicsize[size] = dicsize.get(size, 0) + 1
        return dicsize

    def statsizes (self, upstream_coord=None, downstream_coord=None):
        '''Return (mean, median, standard deviation) of read sizes; subcoordinates may be
        specified to save memory (see the readcount method for discussion).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        L = []
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                L.append(size)
        meansize = mean(L)
        stdv = std(L)
        mediansize = median(L)
        return meansize, mediansize, stdv

    def foldEnergy (self, upstream_coord=None, downstream_coord=None):
        '''Return the RNAfold minimum free energy of the window sequence; subcoordinates may be
        specified to save memory (see the readcount method for discussion).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord]])
        return float(Energy[self.sequence[upstream_coord-1:downstream_coord]])

    def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None):
        '''Return the percentage of reads starting with a U (T in the reference) among reads whose size is in size_scope;
        size_scope must be an iterable. Subcoordinates may be specified to save memory (see the readcount method for discussion).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        freqDic = {"A":0, "T":0, "G":0, "C":0, "N":0}
        convertDic = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                if size in size_scope:
                    startbase = self.sequence[abs(offset)-self.windowoffset]
                    if offset < 0:
                        startbase = convertDic[startbase]
                    freqDic[startbase] += 1
        base_sum = float(sum(freqDic.values()))
        if base_sum == 0:
            return "."
        else:
            return freqDic["T"] / base_sum * 100

    def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None):
        '''Same as Ufreq but strand-aware: returns "forward % | reverse %" of reads starting with a U, among reads whose size is in size_scope;
        size_scope must be an iterable. Subcoordinates may be specified to save memory (see the readcount method for discussion).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        freqDic = {"Afor":0, "Tfor":0, "Gfor":0, "Cfor":0, "Nfor":0, "Arev":0, "Trev":0, "Grev":0, "Crev":0, "Nrev":0}
        convertDic = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                if size in size_scope:
                    startbase = self.sequence[abs(offset)-self.windowoffset]
                    if offset < 0:
                        startbase = convertDic[startbase]
                        freqDic[startbase+"rev"] += 1
                    else:
                        freqDic[startbase+"for"] += 1
        forward_sum = float(freqDic["Afor"] + freqDic["Tfor"] + freqDic["Gfor"] + freqDic["Cfor"] + freqDic["Nfor"])
        reverse_sum = float(freqDic["Arev"] + freqDic["Trev"] + freqDic["Grev"] + freqDic["Crev"] + freqDic["Nrev"])
        if forward_sum == 0 and reverse_sum == 0:
            return ". | ."
        elif reverse_sum == 0:
            return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
        elif forward_sum == 0:
            return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
        else:
            return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)


    def readplot (self):
        '''Return a list of "gene<TAB>offset<TAB>count<TAB>polarity" strings, with reverse counts reported as negative values.'''
        readmap = {}
        for offset in self.readDict.keys():
            readmap[abs(offset)] = (len(self.readDict[-abs(offset)]), len(self.readDict[abs(offset)]))
        mylist = []
        for offset in sorted(readmap):
            if readmap[offset][1] != 0:
                mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F"))
            if readmap[offset][0] != 0:
                mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R"))
        return mylist

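    # Illustrative sketch (not part of the original tool): dump the per-offset read counts
    # of a SmRNAwindow instance "window" in the tabular form produced by readplot().
    #
    #   for row in window.readplot():
    #       print row
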
    def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None):
        '''This method has not been tested yet (15-11-2013).'''
        upstream_coord = upstream_coord or 1
        downstream_coord = downstream_coord or self.size
        windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
        forORrev_coverage = dict([(i, 0) for i in xrange(1, downstream_coord-upstream_coord+2)]) # +2 so that the last position of the window is included
        totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
        totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
        if totalforward > totalreverse:
            majorcoverage = "forward"
            for offset in self.readDict.keys():
                if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys()):
                    for read in self.readDict[offset]:
                        for i in xrange(read):
                            try:
                                forORrev_coverage[offset-upstream_coord+1+i] += 1
                            except KeyError:
                                continue # a sense read may span over the downstream limit
        else:
            majorcoverage = "reverse"
            for offset in self.readDict.keys():
                if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys()):
                    for read in self.readDict[offset]:
                        for i in xrange(read):
                            try:
                                forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage
                            except KeyError:
                                continue # an antisense read may span over the upstream limit
        output_list = []
        maximum = max(forORrev_coverage.values()) or 1
        for n in sorted(forORrev_coverage):
            output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage))
        return "\n".join(output_list)


    def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None):
        '''Compute an overlap signature between query reads (minquery-maxquery nt) and target reads (mintarget-maxtarget nt).
        scope must be a python iterable defining the *relative* overlap range to be computed.
        Subcoordinates may be specified to save memory (see the readcount method for discussion).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        query_range = range (minquery, maxquery + 1)
        target_range = range (mintarget, maxtarget + 1)
        Query_table = {}
        Target_table = {}
        frequency_table = dict([(i, 0) for i in scope])
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                if size in query_range:
                    Query_table[offset] = Query_table.get(offset, 0) + 1
                if size in target_range:
                    Target_table[offset] = Target_table.get(offset, 0) + 1
        for offset in Query_table:
            for i in scope:
                frequency_table[i] += min(Query_table[offset], Target_table.get(-offset - i + 1, 0))
        if minquery == mintarget and maxquery == maxtarget: ## divide by 2 when query and target size ranges are identical, since each pair is counted twice (26/11/2013); see signature_options.py and lattice_signature.py
            frequency_table = dict([(i, frequency_table[i]/2) for i in frequency_table])
        if zscore == "yes":
            z_mean = mean(frequency_table.values())
            z_std = std(frequency_table.values())
            if z_std == 0:
                frequency_table = dict([(i, 0) for i in frequency_table])
            else:
                frequency_table = dict([(i, (frequency_table[i] - z_mean)/z_std) for i in frequency_table])
        return frequency_table

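    # Illustrative sketch (not part of the original tool): overlap frequencies for 1-25 nt
    # overlaps between 23-29 nt reads of a SmRNAwindow instance "window".
    #
    #   overlap_profile = window.signature(23, 29, 23, 29, range(1, 26), zscore="no")
    #   for overlap in sorted(overlap_profile):
    #       print overlap, overlap_profile[overlap]
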
    def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None):
        '''Variant of the signature method in which each query offset contributes a frequency normalized by its number of targets and by the total number of query reads.
        scope must be an iterable (a list or a tuple) specifying the relative overlaps to compute.
        Subcoordinates may be specified to save memory (see the readcount method for discussion).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        query_range = range (minquery, maxquery + 1)
        target_range = range (mintarget, maxtarget + 1)
        Query_table = {}
        Target_table = {}
        Total_Query_Numb = 0
        general_frequency_table = dict([(i, 0) for i in scope])
        ## filter the appropriate reads for the analysis
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                if size in query_range:
                    Query_table[offset] = Query_table.get(offset, 0) + 1
                    Total_Query_Numb += 1
                if size in target_range:
                    Target_table[offset] = Target_table.get(offset, 0) + 1
        for offset in Query_table:
            frequency_table = dict([(i, 0) for i in scope])
            number_of_targets = 0
            for i in scope:
                frequency_table[i] += Query_table[offset] * Target_table.get(-offset - i + 1, 0)
                number_of_targets += Target_table.get(-offset - i + 1, 0)
            for i in scope:
                try:
                    general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
                except ZeroDivisionError:
                    continue
        return general_frequency_table

    def phasing (self, size_range, scope):
        '''Compute an autocorrelation-like phasing signal for reads whose size is in size_range; scope must be a python iterable.'''
        read_table = {}
        total_read_number = 0
        general_frequency_table = dict([(i, 0) for i in scope])
        ## read input filtering
        for offset in self.readDict:
            for size in self.readDict[offset]:
                if size in size_range:
                    read_table[offset] = read_table.get(offset, 0) + 1
                    total_read_number += 1
        ## compute read phasing for each offset
        for offset in read_table:
            frequency_table = dict([(i, 0) for i in scope]) # local frequency table
            number_of_targets = 0
            for i in scope:
                if offset > 0:
                    frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
                    number_of_targets += read_table.get(offset + i, 0)
                else:
                    frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
                    number_of_targets += read_table.get(offset - i, 0)
            ## include the local frequency table in the general frequency table (average over all offsets)
            for i in scope:
                try:
                    general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
                except ZeroDivisionError:
                    continue
        return general_frequency_table


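    # Illustrative sketch (not part of the original tool): phasing signal of 23-29 nt reads
    # of a SmRNAwindow instance "window", over relative distances of 1-100 nt.
    #
    #   phasing_profile = window.phasing(range(23, 30), range(1, 101))
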
    def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
        '''Return the z-score transformed signature; mean and std come from numpy (imported at module level).
        scope must be a python iterable defining the relative overlaps to compute.'''
        frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
        z_table = {}
        frequency_list = [frequency_table[i] for i in sorted(frequency_table)]
        if std(frequency_list):
            meanlist = mean(frequency_list)
            stdlist = std(frequency_list)
            z_list = [(i - meanlist)/stdlist for i in frequency_list]
            return dict(zip(sorted(frequency_table), z_list))
        else:
            return dict(zip(sorted(frequency_table), [0 for i in frequency_table]))

    def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
        '''Return the signature expressed as a percentage of the reads in the query and target size ranges.'''
        frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
        sizes = self.readsizes()
        total = float(sum([sizes.get(i, 0) for i in set(range(minquery, maxquery + 1) + range(mintarget, maxtarget + 1))])) # upper bounds included, as in the signature method
        if total == 0:
            return dict([(i, 0) for i in scope])
        return dict([(i, frequency_table[i]/total*100) for i in scope])

    def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
        '''Return the sequences of query/target read pairs that overlap by the given number of nucleotides, as "+"/"-" prefixed RNA strings.'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range (int(minquery), int(maxquery) + 1)
        target_range = range (int(mintarget), int(maxtarget) + 1)
        paired_sequences = []
        for offset in self.readDict: # data selection
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)
        for offset in queryhash:
            matched_offset = -offset - overlap + 1 # same formula for forward and reverse query offsets
            if targethash[matched_offset]:
                paired = min(len(queryhash[offset]), len(targethash[matched_offset]))
                if offset >= 0:
                    for i in range (paired):
                        paired_sequences.append("+%s" % RNAtranslate(self.sequence[offset:offset+queryhash[offset][i]]))
                        paired_sequences.append("-%s" % RNAtranslate(antipara(self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1])))
                if offset < 0:
                    for i in range (paired):
                        paired_sequences.append("-%s" % RNAtranslate(antipara(self.sequence[-offset-queryhash[offset][i]+1:-offset+1])))
                        paired_sequences.append("+%s" % RNAtranslate(self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]]))
        return paired_sequences

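    # Illustrative sketch (not part of the original tool): extract 19-23 nt read pairs of a
    # SmRNAwindow instance "window" that overlap by 10 nt; the __main__ driver at the end of
    # this module uses pairer() in the same way, with a 19 nt overlap.
    #
    #   pairs = window.pairer(10, 19, 23, 19, 23)
    #   print len(pairs) / 2, "pairs found"
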
    def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
        '''Return the sequences of all query and target reads that can be paired with the given overlap, as "+"/"-" prefixed RNA strings.'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range (int(minquery), int(maxquery) + 1)
        target_range = range (int(mintarget), int(maxtarget) + 1)
        paired_sequences = []

        for offset in self.readDict: # data selection
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)

        for offset in queryhash:
            matched_offset = -offset - overlap + 1
            if targethash[matched_offset]:
                if offset >= 0:
                    for i in queryhash[offset]:
                        paired_sequences.append("+%s" % RNAtranslate(self.sequence[offset:offset+i]))
                    for i in targethash[matched_offset]:
                        paired_sequences.append("-%s" % RNAtranslate(antipara(self.sequence[-matched_offset-i+1:-matched_offset+1])))
                if offset < 0:
                    for i in queryhash[offset]:
                        paired_sequences.append("-%s" % RNAtranslate(antipara(self.sequence[-offset-i+1:-offset+1])))
                    for i in targethash[matched_offset]:
                        paired_sequences.append("+%s" % RNAtranslate(self.sequence[matched_offset:matched_offset+i]))
        return paired_sequences

    def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
        '''Revision of pairable on 3-12-2012, focused on the offset shift problem (bowtie uses 1-based coordinates whereas python strings are 0-based).'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range (int(minquery), int(maxquery) + 1)
        target_range = range (int(mintarget), int(maxtarget) + 1)
        bowtie_output = []

        for offset in self.readDict: # data selection
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)
        counter = 0
        for offset in queryhash:
            matched_offset = -offset - overlap + 1
            if targethash[matched_offset]:
                if offset >= 0:
                    for i in queryhash[offset]:
                        counter += 1
                        bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i])) # beware of the 1-based (bowtie) vs 0-based (python) offset
                if offset < 0:
                    for i in queryhash[offset]:
                        counter += 1
                        bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # beware of the 1-based (bowtie) vs 0-based (python) offset
        return bowtie_output


def __main__(bowtie_index_path, bowtie_output_path):
    sequenceDic = get_fasta (bowtie_index_path)
    objDic = {}
    F = open (bowtie_output_path, "r") # F is the bowtie output taken as input
    for line in F:
        fields = line.split()
        polarity = fields[1]
        gene = fields[2]
        offset = int(fields[3])
        size = len (fields[4])
        try:
            objDic[gene].addread (polarity, offset, size) # note: the bowtie offset is used as-is here, not shifted to 1-based as in HandleSmRNAwindows.readfile
        except KeyError:
            objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
            objDic[gene].addread (polarity, offset, size)
    F.close()
    for gene in objDic:
        print gene, objDic[gene].pairer(19, 19, 23, 19, 23)

if __name__ == "__main__": __main__(sys.argv[1], sys.argv[2])
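
# Illustrative sketch (not part of the original tool): standalone invocation; the index
# basename and the bowtie output file below are hypothetical.
#
#   python smRtools.py /path/to/bowtie_index/dmel aligned_reads.bowtie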