comparison view of smRtools.py @ 7:20b8ff9c1cb9 (draft, default, tip)

commit message: Uploaded
author: drosofff
date:   Mon, 23 Jun 2014 05:24:28 -0400
compared revisions: 6:11d042a0fbdf and 7:20b8ff9c1cb9
1 #!/usr/bin/python
2 # version 1 7-5-2012 unification of the SmRNAwindow class
3
4 import sys, subprocess, random
5 from collections import defaultdict, OrderedDict
6 from numpy import mean, median, std
7 from scipy import stats
8
9 def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
10 '''This function will return a dictionary containing fasta identifiers as keys and the
11 sequences as values. Index must be the path (basename) of a bowtie index.'''
12 p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
13 outputlines = p.stdout.readlines()
14 p.wait()
15 item_dic = {}
16 for line in outputlines:
17 if (line[0] == ">"):
18 try:
19 item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item
20 except NameError: pass # the first header has no previous item: current_item is not yet defined
21 current_item = line[1:].rstrip().split()[0] #take the first word before space because bowtie splits headers !
22 item_dic[current_item] = ""
23 stringlist=[]
24 else:
25 stringlist.append(line.rstrip() )
26 item_dic[current_item] = "".join(stringlist) # for the last item
27 return item_dic
28
29 def get_fasta_headers (index):
30 p = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect -n outputs one reference name per line
31 outputlines = p.stdout.readlines()
32 p.wait()
33 item_dic = {}
34 for line in outputlines:
35 header = line.rstrip().split()[0] #take the first word before space because bowtie splits headers !
36 item_dic[header] = 1
37 return item_dic
38
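# A minimal usage sketch, not part of the original tool, assuming a bowtie index whose
# basename is "/path/to/index" and that bowtie-inspect is available on the PATH.
def _example_bowtie_inspect_helpers(index="/path/to/index"):
    genome = get_fasta(index)            # {header: sequence}
    headers = get_fasta_headers(index)   # {header: 1}
    first = sorted(genome)[0]
    return first, len(genome[first]), len(headers)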
39
40 def get_file_sample (file, numberoflines):
41 '''Return a random sample of numberoflines lines from file (uses the random module imported above)'''
42 F=open(file)
43 fullfile = F.read().splitlines()
44 F.close()
45 if len(fullfile) < numberoflines:
46 return "sample size exceeds file size"
47 return random.sample(fullfile, numberoflines)
48
49 def get_fasta_from_history (file):
50 F = open (file, "r")
51 item_dic = {}
52 for line in F:
53 if (line[0] == ">"):
54 try:
55 item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item
56 except NameError: pass # the first header has no previous item: current_item is not yet defined
57 current_item = line[1:-1].split()[0] #take the first word before space because bowtie splits headers !
58 item_dic[current_item] = ""
59 stringlist=[]
60 else:
61 stringlist.append(line[:-1])
62 item_dic[current_item] = "".join(stringlist) # for the last item
63 return item_dic
64
65 def antipara (sequence):
66 antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
67 revseq = sequence[::-1]
68 return "".join([antidict[i] for i in revseq])
69
70 def RNAtranslate (sequence):
71 return "".join([i if i in "AGCN" else "U" for i in sequence])
72 def DNAtranslate (sequence):
73 return "".join([i if i in "AGCN" else "T" for i in sequence])
74
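# A minimal sketch, not part of the original tool, of the three sequence helpers above:
# antipara returns the reverse complement, RNAtranslate and DNAtranslate switch between
# the T and U alphabets.
def _example_sequence_helpers():
    assert antipara("ATGC") == "GCAT"
    assert RNAtranslate("ATGC") == "AUGC"
    assert DNAtranslate("AUGC") == "ATGC"
    return True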
75 def RNAfold (sequence_list):
76 thestring= "\n".join(sequence_list)
77 p = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
78 output=p.communicate(thestring)[0]
79 p.wait()
80 output=output.split("\n")
81 if not output[-1]: output = output[:-1] # nasty patch to remove last empty line
82 buffer=[]
83 for line in output:
84 if line[0] in ["N","A","T","U","G","C"]:
85 buffer.append(DNAtranslate(line))
86 if line[0] in ["(",".",")"]:
87 fields=line.split("(")
88 energy= fields[-1]
89 energy = energy[:-1] # remove the ) parenthesis
90 energy=float(energy)
91 buffer.append(str(energy))
92 return dict(zip(buffer[::2], buffer[1::2]))
93
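# A minimal sketch, not part of the original tool, assuming the ViennaRNA "RNAfold"
# executable is installed and on the PATH: RNAfold() above returns a dictionary mapping
# each input sequence, in DNA alphabet, to its predicted minimum free energy as a string.
def _example_RNAfold():
    energies = RNAfold(["GCGCTTCGCCGCGCGGCGCAAGCGC"])  # arbitrary example sequence
    return energies.values()[0]  # e.g. "-10.2"; the exact value depends on the RNAfold version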
94 def extractsubinstance (start, end, instance):
95 ''' Testing whether this can be a function external to the class to save memory'''
96 subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start)
97 subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
98 upcoordinate = [i for i in range(start,end+1) if instance.readDict[i] ]
99 downcoordinate = [-i for i in range(start,end+1) if instance.readDict[-i] ]
100 for i in upcoordinate:
101 subinstance.readDict[i]=instance.readDict[i]
102 for i in downcoordinate:
103 subinstance.readDict[i]=instance.readDict[i]
104 return subinstance
105
106 class HandleSmRNAwindows:
107 def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined"):
108 self.biosample = biosample
109 self.alignmentFile = alignmentFile
110 self.alignmentFileFormat = alignmentFileFormat # can be "tabular" or "sam"
111 self.genomeRefFile = genomeRefFile
112 self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource"
113 self.alignedReads = 0
114 self.instanceDict = {}
115 if genomeRefFormat == "bowtieIndex":
116 self.itemDict = get_fasta (genomeRefFile)
117 elif genomeRefFormat == "fastaSource":
118 self.itemDict = get_fasta_from_history (genomeRefFile)
119 for item in self.itemDict:
120 self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample) # create as many instances as there are items
121 self.readfile()
122
123 def readfile (self) :
124 if self.alignmentFileFormat == "tabular":
125 F = open (self.alignmentFile, "r")
126 for line in F:
127 fields = line.split()
128 polarity = fields[1]
129 gene = fields[2]
130 offset = int(fields[3])
131 size = len (fields[4])
132 self.instanceDict[gene].addread (polarity, offset+1, size) # convert the 0-based bowtie offset to the 1-based coordinates of SmRNAwindow
133 self.alignedReads += 1
134 F.close()
135 elif self.alignmentFileFormat == "sam":
136 F = open (self.alignmentFile, "r")
137 flag_to_polarity = {"0":"+", "16":"-"} # SAM flag 0 = forward, 16 = reverse
138 for line in F:
139 if line[0]=='@':
140 continue
141 fields = line.split()
142 if fields[2] == "*": continue
143 polarity = flag_to_polarity[fields[1]]
144 gene = fields[2]
145 offset = int(fields[3])
146 size = len (fields[9])
147 self.instanceDict[gene].addread (polarity, offset, size) # SAM positions are already 1-based
148 self.alignedReads += 1
149 F.close()
150 elif self.alignmentFileFormat == "bam":
151 import pysam
152 samfile = pysam.Samfile(self.alignmentFile)
153 for read in samfile:
154 if read.tid == -1:
155 continue # filter out unaligned reads
156 if read.is_reverse:
157 polarity="-"
158 else:
159 polarity="+"
160 gene = samfile.getrname(read.tid)
161 offset = read.pos
162 size = read.qlen
163 self.instanceDict[gene].addread (polarity, offset+1, size) # pysam converts coordinates to 0-based (https://media.readthedocs.org/pdf/pysam/latest/pysam.pdf)
164 self.alignedReads += 1
165 return
166
167 def CountFeatures (self, GFF3="path/to/file"):
168 featureDict = defaultdict(int)
169 F = open (GFF3, "r")
170 for line in F:
171 if line[0] == "#": continue
172 fields = line[:-1].split()
173 chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
174 featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
175 F.close()
176 return featureDict
177
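# A minimal sketch, not part of the original tool, of the typical driver code for the
# class above: parse a bowtie tabular alignment against a bowtie index and count reads
# per reference item. The two paths are placeholders, not files shipped with the tool.
def _example_handle_windows():
    handler = HandleSmRNAwindows(alignmentFile="/path/to/bowtie_output.tabular",
                                 alignmentFileFormat="tabular",
                                 genomeRefFile="/path/to/bowtie_index",
                                 genomeRefFormat="bowtieIndex",
                                 biosample="sample_1")
    counts = dict((item, window.readcount()) for item, window in handler.instanceDict.items())
    return handler.alignedReads, counts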
178 class SmRNAwindow:
179
180 def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined"):
181 self.biosample = biosample
182 self.sequence = sequence
183 self.gene = gene
184 self.windowoffset = windowoffset
185 self.size = len(sequence)
186 self.readDict = defaultdict(list) # maps signed offsets to read sizes: {+/-offset: [size1, size2, ...], ...}
187 self.matchedreadsUp = 0
188 self.matchedreadsDown = 0
189
190 def addread (self, polarity, offset, size):
191 '''Add a read at a signed 1-based offset (positive for forward, negative for reverse reads).
192 Note: the conversion from 0- to 1-based offsets is now done during read parsing, not here.'''
193 if polarity == "+":
194 self.readDict[offset].append(size)
195 self.matchedreadsUp += 1
196 else:
197 self.readDict[-(offset + size -1)].append(size)
198 self.matchedreadsDown += 1
199 return
200
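# A minimal sketch, not part of the original tool, of the storage convention used by
# addread: forward reads are stored under their 1-based 5' offset, reverse reads under
# the negative 5' offset on the other strand, i.e. -(offset + size - 1).
#   >>> w = SmRNAwindow("toy", sequence="A" * 100)
#   >>> w.addread("+", 10, 21)    # stored under key +10
#   >>> w.addread("-", 10, 21)    # stored under key -(10 + 21 - 1) = -30
#   >>> sorted(w.readDict.keys())
#   [-30, 10]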
201 def barycenter (self, upstream_coord=None, downstream_coord=None):
202 '''refactored 24-12-2013 to save memory and introduce offset filtering; see the readcount method for further discussion.
203 In this version the dictionary structure is replaced by a list of tuples to save memory'''
204 upstream_coord = upstream_coord or self.windowoffset
205 downstream_coord = downstream_coord or self.windowoffset+self.size-1
206 window_size = downstream_coord - upstream_coord +1
207 def weigthAverage (TuppleList):
208 weightSum = 0
209 PonderWeightSum = 0
210 for tuple in TuppleList:
211 PonderWeightSum += tuple[0] * tuple[1]
212 weightSum += tuple[1]
213 if weightSum > 0:
214 return PonderWeightSum / float(weightSum)
215 else:
216 return 0
217 forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both forward and in the proper offset window
218 reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both reverse and in the proper offset window
219 Fbarycenter = (weigthAverage (forwardTuppleList) - upstream_coord) / window_size
220 Rbarycenter = (weigthAverage (reverseTuppleList) - upstream_coord) / window_size
221 return Fbarycenter, Rbarycenter
222
223 def correlation_mapper (self, reference, window_size):
224 '''map the Spearman correlation to a reference profile with a sliding window (26-2-2013)'''
225 if window_size > self.size:
226 return []
227 F=open(reference, "r")
228 reference_forward = []
229 reference_reverse = []
230 for line in F:
231 fields=line.split()
232 reference_forward.append(int(float(fields[1])))
233 reference_reverse.append(int(float(fields[2])))
234 F.close()
235 local_object_forward=[]
236 local_object_reverse=[]
237 ## Dict to list for the local object
238 for i in range(1, self.size+1):
239 local_object_forward.append(len(self.readDict[i]))
240 local_object_reverse.append(len(self.readDict[-i]))
241 ## start compiling results by slides
242 results=[]
243 for coordinate in range(self.size - window_size):
244 local_forward=local_object_forward[coordinate:coordinate + window_size]
245 local_reverse=local_object_reverse[coordinate:coordinate + window_size]
246 if sum(local_forward) == 0 or sum(local_reverse) == 0:
247 continue
248 try:
249 reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward)
250 reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
251 if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2):
252 results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]])
253 except Exception: # guard against spearmanr failures (e.g. on degenerate input)
254 pass
255 return results
256
257 def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"):
258 '''refactored 24-12-2013 to save memory and introduce offset filtering
259 note that the default parameters cannot be defined relative to the instance, as they are evaluated before instantiation;
260 the trick is to pass None and then test for it
261 the polarity parameter can take "both", "forward" or "reverse" as value'''
262 upstream_coord = upstream_coord or self.windowoffset
263 downstream_coord = downstream_coord or self.windowoffset+self.size-1
264 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both":
265 return self.matchedreadsUp + self.matchedreadsDown
266 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward":
267 return self.matchedreadsUp
268 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse":
269 return self.matchedreadsDown
270 n=0
271 if polarity == "both":
272 for offset in xrange(upstream_coord, downstream_coord+1):
273 if self.readDict.has_key(offset):
274 for read in self.readDict[offset]:
275 if (read>=size_inf and read<= size_sup):
276 n += 1
277 if method != "conservative":
278 del self.readDict[offset] ## Careful! this deletion precludes re-use of the self.readDict dictionary
279 if self.readDict.has_key(-offset):
280 for read in self.readDict[-offset]:
281 if (read>=size_inf and read<= size_sup):
282 n += 1
283 if method != "conservative":
284 del self.readDict[-offset]
285 return n
286 elif polarity == "forward":
287 for offset in xrange(upstream_coord, downstream_coord+1):
288 if self.readDict.has_key(offset):
289 for read in self.readDict[offset]:
290 if (read>=size_inf and read<= size_sup):
291 n += 1
292 return n
293 elif polarity == "reverse":
294 for offset in xrange(upstream_coord, downstream_coord+1):
295 if self.readDict.has_key(-offset):
296 for read in self.readDict[-offset]:
297 if (read>=size_inf and read<= size_sup):
298 n += 1
299 return n
300
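# A minimal sketch, not part of the original tool, of readcount: count 19-23 nt forward
# reads falling in a sub-window, with the default non-destructive ("conservative") method.
#   >>> w = SmRNAwindow("toy", sequence="A" * 100)
#   >>> w.addread("+", 10, 21)
#   >>> w.addread("+", 50, 25)
#   >>> w.readcount(size_inf=19, size_sup=23, upstream_coord=1, downstream_coord=40, polarity="forward")
#   1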
301 def readsizes (self):
302 '''return a dictionary of number of reads by size (the keys)'''
303 dicsize = {}
304 for offset in self.readDict:
305 for size in self.readDict[offset]:
306 dicsize[size] = dicsize.get(size, 0) + 1
307 for size in range (min(dicsize.keys()), max(dicsize.keys())+1):
308 dicsize[size] = dicsize.get(size, 0) # fill missing read sizes with null values
309 return dicsize
310
311 def statsizes (self, upstream_coord=None, downstream_coord=None):
312 ''' migration to memory saving by specifying possible subcoordinates
313 see the readcount method for further discussion'''
314 upstream_coord = upstream_coord or self.windowoffset
315 downstream_coord = downstream_coord or self.windowoffset+self.size-1
316 L = []
317 for offset in self.readDict:
318 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
319 for size in self.readDict[offset]:
320 L.append(size)
321 meansize = mean(L)
322 stdv = std(L)
323 mediansize = median(L)
324 return meansize, mediansize, stdv
325
326 def foldEnergy (self, upstream_coord=None, downstream_coord=None):
327 ''' migration to memory saving by specifying possible subcoordinates
328 see the readcount method for further discussion'''
329 upstream_coord = upstream_coord or self.windowoffset
330 downstream_coord = downstream_coord or self.windowoffset+self.size-1
331 Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ])
332 return float(Energy[self.sequence[upstream_coord-1:downstream_coord]])
333
334 def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None):
335 ''' migration to memory saving by specifying possible subcoordinates;
336 see the readcount method for further discussion. size_scope must be an iterable'''
337 upstream_coord = upstream_coord or self.windowoffset
338 downstream_coord = downstream_coord or self.windowoffset+self.size-1
339 freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0}
340 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
341 for offset in self.readDict:
342 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
343 for size in self.readDict[offset]:
344 if size in size_scope:
345 startbase = self.sequence[abs(offset)-self.windowoffset]
346 if offset < 0:
347 startbase = convertDic[startbase]
348 freqDic[startbase] += 1
349 base_sum = float ( sum( freqDic.values()) )
350 if base_sum == 0:
351 return "."
352 else:
353 return freqDic["T"] / base_sum * 100
354
355 def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None):
356 ''' migration to memory saving by specifying possible subcoordinates;
357 see the readcount method for further discussion. size_scope must be an iterable.
358 This method is similar to the Ufreq method but takes strandedness into account'''
359 upstream_coord = upstream_coord or self.windowoffset
360 downstream_coord = downstream_coord or self.windowoffset+self.size-1
361 freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0}
362 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
363 for offset in self.readDict:
364 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
365 for size in self.readDict[offset]:
366 if size in size_scope:
367 startbase = self.sequence[abs(offset)-self.windowoffset]
368 if offset < 0:
369 startbase = convertDic[startbase]
370 freqDic[startbase+"rev"] += 1
371 else:
372 freqDic[startbase+"for"] += 1
373 forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"])
374 reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"])
375 if forward_sum == 0 and reverse_sum == 0:
376 return ". | ."
377 elif reverse_sum == 0:
378 return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
379 elif forward_sum == 0:
380 return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
381 else:
382 return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)
383
384
385 def readplot (self):
386 readmap = {}
387 for offset in self.readDict.keys():
388 readmap[abs(offset)] = ( len(self.readDict[-abs(offset)]) , len(self.readDict[abs(offset)]) )
389 mylist = []
390 for offset in sorted(readmap):
391 if readmap[offset][1] != 0:
392 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") )
393 if readmap[offset][0] != 0:
394 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") )
395 return mylist
396
397 def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None):
398 '''Used by the MirParser tool'''
399 upstream_coord = upstream_coord or 1
400 downstream_coord = downstream_coord or self.size
401 windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
402 forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+1)])
403 totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
404 totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
405 if totalforward > totalreverse:
406 majorcoverage = "forward"
407 for offset in self.readDict.keys():
408 if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ):
409 for read in self.readDict[offset]:
410 for i in xrange(read):
411 try:
412 forORrev_coverage[offset-upstream_coord+1+i] += 1
413 except KeyError:
414 continue # a sense read may span over the downstream limit
415 else:
416 majorcoverage = "reverse"
417 for offset in self.readDict.keys():
418 if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ):
419 for read in self.readDict[offset]:
420 for i in xrange(read):
421 try:
422 forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage
423 except KeyError:
424 continue # an antisense read may span over the upstream limit
425 output_list = []
426 maximum = max (forORrev_coverage.values()) or 1
427 for n in sorted (forORrev_coverage):
428 output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage))
429 return "\n".join(output_list)
430
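# A minimal sketch, not part of the original tool, of the two reporting methods above:
# readplot returns one tab-separated line per covered 5' position (gene, offset, count
# with a minus sign for the reverse strand, F/R), and readcoverage returns one line per
# position of the window, normalised to the maximum coverage.
#   >>> w = SmRNAwindow("toy", sequence="A" * 60, biosample="sample_1")
#   >>> w.addread("+", 5, 22)
#   >>> w.readplot()
#   ['toy\t5\t1\tF']
#   >>> len(w.readcoverage(upstream_coord=1, downstream_coord=60).split("\n"))
#   59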
431
432 def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None):
433 ''' migration to memory saving by specifying possible subcoordinates;
434 see the readcount method for further discussion.
435 scope must be a python iterable; scope defines the *relative* offset range to be computed'''
436 upstream_coord = upstream_coord or self.windowoffset
437 downstream_coord = downstream_coord or self.windowoffset+self.size-1
438 query_range = range (minquery, maxquery+1)
439 target_range = range (mintarget, maxtarget+1)
440 Query_table = {}
441 Target_table = {}
442 frequency_table = dict ([(i, 0) for i in scope])
443 for offset in self.readDict:
444 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
445 for size in self.readDict[offset]:
446 if size in query_range:
447 Query_table[offset] = Query_table.get(offset, 0) + 1
448 if size in target_range:
449 Target_table[offset] = Target_table.get(offset, 0) + 1
450 for offset in Query_table:
451 for i in scope:
452 frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0))
453 if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py
454 frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table])
455 if zscore == "yes":
456 z_mean = mean(frequency_table.values() )
457 z_std = std(frequency_table.values() )
458 if z_std == 0:
459 frequency_table = dict([(i,0) for i in frequency_table] )
460 else:
461 frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] )
462 return frequency_table
463
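# A minimal sketch, not part of the original tool, of a "ping-pong"-like use of
# signature(): frequency of 5' overlaps between a query size range and a target size
# range over relative offsets 1..25. When the two ranges are identical the counts are
# halved, as noted in the code above.
#   >>> w = SmRNAwindow("toy", sequence="A" * 200)
#   >>> w.addread("+", 50, 26)   # query-sized read, 5' end at position 50
#   >>> w.addread("-", 30, 30)   # target-sized read, 5' end at position 59 (10 nt 5' overlap)
#   >>> w.signature(23, 29, 30, 35, range(1, 26))[10]
#   1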
464 def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None):
465 ''' migration to memory saving by specifying possible subcoordinates; see the readcount method for further discussion
466 note that scope must be an iterable (a list or a tuple), which specifies the relative offsets that will be computed'''
467 upstream_coord = upstream_coord or self.windowoffset
468 downstream_coord = downstream_coord or self.windowoffset+self.size-1
469 query_range = range (minquery, maxquery+1)
470 target_range = range (mintarget, maxtarget+1)
471 Query_table = {}
472 Target_table = {}
473 Total_Query_Numb = 0
474 general_frequency_table = dict ([(i,0) for i in scope])
475 ## filtering the appropriate reads for the study
476 for offset in self.readDict:
477 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
478 for size in self.readDict[offset]:
479 if size in query_range:
480 Query_table[offset] = Query_table.get(offset, 0) + 1
481 Total_Query_Numb += 1
482 if size in target_range:
483 Target_table[offset] = Target_table.get(offset, 0) + 1
484 for offset in Query_table:
485 frequency_table = dict ([(i,0) for i in scope])
486 number_of_targets = 0
487 for i in scope:
488 frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0)
489 number_of_targets += Target_table.get(-offset -i +1, 0)
490 for i in scope:
491 try:
492 general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
493 except ZeroDivisionError :
494 continue
495 return general_frequency_table
496
497 def phasing (self, size_range, scope):
498 '''calculate an autocorrelation-like signal; scope must be a python iterable'''
499 read_table = {}
500 total_read_number = 0
501 general_frequency_table = dict ([(i, 0) for i in scope])
502 ## read input filtering
503 for offset in self.readDict:
504 for size in self.readDict[offset]:
505 if size in size_range:
506 read_table[offset] = read_table.get(offset, 0) + 1
507 total_read_number += 1
508 ## per offset read phasing computing
509 for offset in read_table:
510 frequency_table = dict ([(i, 0) for i in scope]) # local frequency table
511 number_of_targets = 0
512 for i in scope:
513 if offset > 0:
514 frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
515 number_of_targets += read_table.get(offset + i, 0)
516 else:
517 frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
518 number_of_targets += read_table.get(offset - i, 0)
519 ## inclusion of local frequency table in the general frequency table (all offsets average)
520 for i in scope:
521 try:
522 general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
523 except ZeroDivisionError :
524 continue
525 return general_frequency_table
526
527
528
529 def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
530 '''Requires the numpy mean and std functions (imported at the top of this module); scope must be a python iterable and defines the relative offsets to compute'''
531 frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
532 z_table = {}
533 frequency_list = [frequency_table[i] for i in sorted (frequency_table)]
534 if std(frequency_list):
535 meanlist = mean(frequency_list)
536 stdlist = std(frequency_list)
537 z_list = [(i-meanlist)/stdlist for i in frequency_list]
538 return dict (zip (sorted(frequency_table), z_list) )
539 else:
540 return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) )
541
542 def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
543 frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
544 total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery)+range(mintarget,maxtarget))]) )
545 if total == 0:
546 return dict( [(i,0) for i in scope])
547 return dict( [(i, frequency_table[i]/total*100) for i in scope])
548
549 def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
550 queryhash = defaultdict(list)
551 targethash = defaultdict(list)
552 query_range = range (int(minquery), int(maxquery)+1)
553 target_range = range (int(mintarget), int(maxtarget)+1)
554 paired_sequences = []
555 for offset in self.readDict: # selection of data
556 for size in self.readDict[offset]:
557 if size in query_range:
558 queryhash[offset].append(size)
559 if size in target_range:
560 targethash[offset].append(size)
561 for offset in queryhash:
562 # the matched offset is computed with the same formula for both polarities
563 matched_offset = -offset - overlap + 1
564 if targethash[matched_offset]:
565 paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) )
566 if offset >= 0:
567 for i in range (paired):
568 paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) )
569 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) )
570 if offset < 0:
571 for i in range (paired):
572 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) )
573 paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) )
574 return paired_sequences
575
576 def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
577 queryhash = defaultdict(list)
578 targethash = defaultdict(list)
579 query_range = range (int(minquery), int(maxquery)+1)
580 target_range = range (int(mintarget), int(maxtarget)+1)
581 paired_sequences = []
582
583 for offset in self.readDict: # selection of data
584 for size in self.readDict[offset]:
585 if size in query_range:
586 queryhash[offset].append(size)
587 if size in target_range:
588 targethash[offset].append(size)
589
590 for offset in queryhash:
591 matched_offset = -offset - overlap + 1
592 if targethash[matched_offset]:
593 if offset >= 0:
594 for i in queryhash[offset]:
595 paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) )
596 for i in targethash[matched_offset]:
597 paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) )
598 if offset < 0:
599 for i in queryhash[offset]:
600 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) )
601 for i in targethash[matched_offset]:
602 paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) )
603 return paired_sequences
604
605 def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
606 ''' revision of pairable on 3-12-2012, with focus on the offset shift problem (bowtie coordinates are 1-based whereas python strings are 0-based)'''
607 queryhash = defaultdict(list)
608 targethash = defaultdict(list)
609 query_range = range (int(minquery), int(maxquery)+1)
610 target_range = range (int(mintarget), int(maxtarget)+1)
611 bowtie_output = []
612
613 for offset in self.readDict: # selection of data
614 for size in self.readDict[offset]:
615 if size in query_range:
616 queryhash[offset].append(size)
617 if size in target_range:
618 targethash[offset].append(size)
619 counter = 0
620 for offset in queryhash:
621 matched_offset = -offset - overlap + 1
622 if targethash[matched_offset]:
623 if offset >= 0:
624 for i in queryhash[offset]:
625 counter += 1
626 bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # mind the 1-based (bowtie) versus 0-based (python) offset
627 if offset < 0:
628 for i in queryhash[offset]:
629 counter += 1
630 bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # mind the 1-based (bowtie) versus 0-based (python) offset
631 return bowtie_output
632
633
634 def __main__(bowtie_index_path, bowtie_output_path):
635 sequenceDic = get_fasta (bowtie_index_path)
636 objDic = {}
637 F = open (bowtie_output_path, "r") # F is the bowtie output taken as input
638 for line in F:
639 fields = line.split()
640 polarity = fields[1]
641 gene = fields[2]
642 offset = int(fields[3])
643 size = len (fields[4])
644 try:
645 objDic[gene].addread (polarity, offset, size)
646 except KeyError:
647 objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
648 objDic[gene].addread (polarity, offset, size)
649 F.close()
650 for gene in objDic:
651 print gene, objDic[gene].pairer(19,19,23,19,23)
652
653 if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2])
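# Usage note, as a sketch with placeholder paths: when run as a standalone script the
# module expects a bowtie index basename and a bowtie tabular output, e.g.
#   python smRtools.py /path/to/bowtie_index /path/to/bowtie_output.tabular
# and prints, for each reference item, the pairs of 19-23 nt reads overlapping by 19 nt
# found by pairer().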