7
|
1 #!/usr/bin/python
|
|
2 # version 1 7-5-2012 unification of the SmRNAwindow class
|
|
3
|
|
4 import sys, subprocess
|
|
5 from collections import defaultdict, OrderedDict
|
|
6 from numpy import mean, median, std
|
|
7 from scipy import stats
|
|
8
|
|
9 def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
|
|
10 '''This function will return a dictionary containing fasta identifiers as keys and the
|
|
11 sequence as values. Index must be the path to a fasta file.'''
|
|
12 p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
|
|
13 outputlines = p.stdout.readlines()
|
|
14 p.wait()
|
|
15 item_dic = {}
|
|
16 for line in outputlines:
|
|
17 if (line[0] == ">"):
|
|
18 try:
|
|
19 item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item
|
|
20 except: pass
|
|
21 current_item = line[1:].rstrip().split()[0] #take the first word before space because bowtie splits headers !
|
|
22 item_dic[current_item] = ""
|
|
23 stringlist=[]
|
|
24 else:
|
|
25 stringlist.append(line.rstrip() )
|
|
26 item_dic[current_item] = "".join(stringlist) # for the last item
|
|
27 return item_dic
|
|
28
|
|
29 def get_fasta_headers (index):
|
|
30 p = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
|
|
31 outputlines = p.stdout.readlines()
|
|
32 p.wait()
|
|
33 item_dic = {}
|
|
34 for line in outputlines:
|
|
35 header = line.rstrip().split()[0] #take the first word before space because bowtie splits headers !
|
|
36 item_dic[header] = 1
|
|
37 return item_dic
|
|
38
|
|
39
|
|
40 def get_file_sample (file, numberoflines):
|
|
41 '''import random to use this function'''
|
|
42 F=open(file)
|
|
43 fullfile = F.read().splitlines()
|
|
44 F.close()
|
|
45 if len(fullfile) < numberoflines:
|
|
46 return "sample size exceeds file size"
|
|
47 return random.sample(fullfile, numberoflines)
|
|
48
|
|
49 def get_fasta_from_history (file):
|
|
50 F = open (file, "r")
|
|
51 item_dic = {}
|
|
52 for line in F:
|
|
53 if (line[0] == ">"):
|
|
54 try:
|
|
55 item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item
|
|
56 except: pass
|
|
57 current_item = line[1:-1].split()[0] #take the first word before space because bowtie splits headers !
|
|
58 item_dic[current_item] = ""
|
|
59 stringlist=[]
|
|
60 else:
|
|
61 stringlist.append(line[:-1])
|
|
62 item_dic[current_item] = "".join(stringlist) # for the last item
|
|
63 return item_dic
|
|
64
|
|
65 def antipara (sequence):
|
|
66 antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
|
|
67 revseq = sequence[::-1]
|
|
68 return "".join([antidict[i] for i in revseq])
|
|
69
|
|
70 def RNAtranslate (sequence):
|
|
71 return "".join([i if i in "AGCN" else "U" for i in sequence])
|
|
72 def DNAtranslate (sequence):
|
|
73 return "".join([i if i in "AGCN" else "T" for i in sequence])
|
|
74
|
|
75 def RNAfold (sequence_list):
|
|
76 thestring= "\n".join(sequence_list)
|
|
77 p = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
|
78 output=p.communicate(thestring)[0]
|
|
79 p.wait()
|
|
80 output=output.split("\n")
|
|
81 if not output[-1]: output = output[:-1] # nasty patch to remove last empty line
|
|
82 buffer=[]
|
|
83 for line in output:
|
|
84 if line[0] in ["N","A","T","U","G","C"]:
|
|
85 buffer.append(DNAtranslate(line))
|
|
86 if line[0] in ["(",".",")"]:
|
|
87 fields=line.split("(")
|
|
88 energy= fields[-1]
|
|
89 energy = energy[:-1] # remove the ) parenthesis
|
|
90 energy=float(energy)
|
|
91 buffer.append(str(energy))
|
|
92 return dict(zip(buffer[::2], buffer[1::2]))
|
|
93
|
|
94 def extractsubinstance (start, end, instance):
|
|
95 ''' Testing whether this can be an function external to the class to save memory'''
|
|
96 subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start)
|
|
97 subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
|
|
98 upcoordinate = [i for i in range(start,end+1) if instance.readDict[i] ]
|
|
99 downcoordinate = [-i for i in range(start,end+1) if instance.readDict[-i] ]
|
|
100 for i in upcoordinate:
|
|
101 subinstance.readDict[i]=instance.readDict[i]
|
|
102 for i in downcoordinate:
|
|
103 subinstance.readDict[i]=instance.readDict[i]
|
|
104 return subinstance
|
|
105
|
|
106 class HandleSmRNAwindows:
|
|
107 def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined"):
|
|
108 self.biosample = biosample
|
|
109 self.alignmentFile = alignmentFile
|
|
110 self.alignmentFileFormat = alignmentFileFormat # can be "tabular" or "sam"
|
|
111 self.genomeRefFile = genomeRefFile
|
|
112 self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource"
|
|
113 self.alignedReads = 0
|
|
114 self.instanceDict = {}
|
|
115 if genomeRefFormat == "bowtieIndex":
|
|
116 self.itemDict = get_fasta (genomeRefFile)
|
|
117 elif genomeRefFormat == "fastaSource":
|
|
118 self.itemDict = get_fasta_from_history (genomeRefFile)
|
|
119 for item in self.itemDict:
|
|
120 self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample) # create as many instances as there is items
|
|
121 self.readfile()
|
|
122
|
|
123 def readfile (self) :
|
|
124 if self.alignmentFileFormat == "tabular":
|
|
125 F = open (self.alignmentFile, "r")
|
|
126 for line in F:
|
|
127 fields = line.split()
|
|
128 polarity = fields[1]
|
|
129 gene = fields[2]
|
|
130 offset = int(fields[3])
|
|
131 size = len (fields[4])
|
|
132 self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
|
|
133 self.alignedReads += 1
|
|
134 F.close()
|
|
135 elif self.alignmentFileFormat == "sam":
|
|
136 F = open (self.alignmentFile, "r")
|
|
137 dict = {"0":"+", "16":"-"}
|
|
138 for line in F:
|
|
139 if line[0]=='@':
|
|
140 continue
|
|
141 fields = line.split()
|
|
142 if fields[2] == "*": continue
|
|
143 polarity = dict[fields[1]]
|
|
144 gene = fields[2]
|
|
145 offset = int(fields[3])
|
|
146 size = len (fields[9])
|
|
147 self.instanceDict[gene].addread (polarity, offset, size) # sam format is already 1-based coordinates
|
|
148 self.alignedReads += 1
|
|
149 F.close()
|
|
150 elif self.alignmentFileFormat == "bam":
|
|
151 import pysam
|
|
152 samfile = pysam.Samfile(self.alignmentFile)
|
|
153 for read in samfile:
|
|
154 if read.tid == -1:
|
|
155 continue # filter out unaligned reads
|
|
156 if read.is_reverse:
|
|
157 polarity="-"
|
|
158 else:
|
|
159 polarity="+"
|
|
160 gene = samfile.getrname(read.tid)
|
|
161 offset = read.pos
|
|
162 size = read.qlen
|
|
163 self.instanceDict[gene].addread (polarity, offset+1, size) # pysam converts coordinates to 0-based (https://media.readthedocs.org/pdf/pysam/latest/pysam.pdf)
|
|
164 self.alignedReads += 1
|
|
165 return
|
|
166
|
|
167 def CountFeatures (self, GFF3="path/to/file"):
|
|
168 featureDict = defaultdict(int)
|
|
169 F = open (GFF3, "r")
|
|
170 for line in F:
|
|
171 if line[0] == "#": continue
|
|
172 fields = line[:-1].split()
|
|
173 chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
|
|
174 featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
|
|
175 F.close()
|
|
176 return featureDict
|
|
177
|
|
178 class SmRNAwindow:
|
|
179
|
|
180 def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined"):
|
|
181 self.biosample = biosample
|
|
182 self.sequence = sequence
|
|
183 self.gene = gene
|
|
184 self.windowoffset = windowoffset
|
|
185 self.size = len(sequence)
|
|
186 self.readDict = defaultdict(list) # with a {+/-offset:[size1, size2, ...], ...}
|
|
187 self.matchedreadsUp = 0
|
|
188 self.matchedreadsDown = 0
|
|
189
|
|
190 def addread (self, polarity, offset, size):
|
|
191 '''ATTENTION ATTENTION ATTENTION'''
|
|
192 ''' We removed the conversion from 0 to 1 based offset, as we do this now during readparsing.'''
|
|
193 if polarity == "+":
|
|
194 self.readDict[offset].append(size)
|
|
195 self.matchedreadsUp += 1
|
|
196 else:
|
|
197 self.readDict[-(offset + size -1)].append(size)
|
|
198 self.matchedreadsDown += 1
|
|
199 return
|
|
200
|
|
201 def barycenter (self, upstream_coord=None, downstream_coord=None):
|
|
202 '''refactored 24-12-2013 to save memory and introduce offset filtering see readcount method for further discussion on that
|
|
203 In this version, attempt to replace the dictionary structure by a list of tupple to save memory too'''
|
|
204 upstream_coord = upstream_coord or self.windowoffset
|
|
205 downstream_coord = downstream_coord or self.windowoffset+self.size-1
|
|
206 window_size = downstream_coord - upstream_coord +1
|
|
207 def weigthAverage (TuppleList):
|
|
208 weightSum = 0
|
|
209 PonderWeightSum = 0
|
|
210 for tuple in TuppleList:
|
|
211 PonderWeightSum += tuple[0] * tuple[1]
|
|
212 weightSum += tuple[1]
|
|
213 if weightSum > 0:
|
|
214 return PonderWeightSum / float(weightSum)
|
|
215 else:
|
|
216 return 0
|
|
217 forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both forward and in the proper offset window
|
|
218 reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both reverse and in the proper offset window
|
|
219 Fbarycenter = (weigthAverage (forwardTuppleList) - upstream_coord) / window_size
|
|
220 Rbarycenter = (weigthAverage (reverseTuppleList) - upstream_coord) / window_size
|
|
221 return Fbarycenter, Rbarycenter
|
|
222
|
|
223 def correlation_mapper (self, reference, window_size):
|
|
224 '''to map correlation with a sliding window 26-2-2013'''
|
|
225 if window_size > self.size:
|
|
226 return []
|
|
227 F=open(reference, "r")
|
|
228 reference_forward = []
|
|
229 reference_reverse = []
|
|
230 for line in F:
|
|
231 fields=line.split()
|
|
232 reference_forward.append(int(float(fields[1])))
|
|
233 reference_reverse.append(int(float(fields[2])))
|
|
234 F.close()
|
|
235 local_object_forward=[]
|
|
236 local_object_reverse=[]
|
|
237 ## Dict to list for the local object
|
|
238 for i in range(1, self.size+1):
|
|
239 local_object_forward.append(len(self.readDict[i]))
|
|
240 local_object_reverse.append(len(self.readDict[-i]))
|
|
241 ## start compiling results by slides
|
|
242 results=[]
|
|
243 for coordinate in range(self.size - window_size):
|
|
244 local_forward=local_object_forward[coordinate:coordinate + window_size]
|
|
245 local_reverse=local_object_reverse[coordinate:coordinate + window_size]
|
|
246 if sum(local_forward) == 0 or sum(local_reverse) == 0:
|
|
247 continue
|
|
248 try:
|
|
249 reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward)
|
|
250 reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
|
|
251 if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2):
|
|
252 results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]])
|
|
253 except:
|
|
254 pass
|
|
255 return results
|
|
256
|
|
257 def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"):
|
|
258 '''refactored 24-12-2013 to save memory and introduce offset filtering
|
|
259 take a look at the defaut parameters that cannot be defined relatively to the instance are they are defined before instanciation
|
|
260 the trick is to pass None and then test
|
|
261 polarity parameter can take "both", "forward" or "reverse" as value'''
|
|
262 upstream_coord = upstream_coord or self.windowoffset
|
|
263 downstream_coord = downstream_coord or self.windowoffset+self.size-1
|
|
264 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both":
|
|
265 return self.matchedreadsUp + self.matchedreadsDown
|
|
266 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward":
|
|
267 return self.matchedreadsUp
|
|
268 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse":
|
|
269 return self.matchedreadsDown
|
|
270 n=0
|
|
271 if polarity == "both":
|
|
272 for offset in xrange(upstream_coord, downstream_coord+1):
|
|
273 if self.readDict.has_key(offset):
|
|
274 for read in self.readDict[offset]:
|
|
275 if (read>=size_inf and read<= size_sup):
|
|
276 n += 1
|
|
277 if method != "conservative":
|
|
278 del self.readDict[offset] ## Carefull ! precludes re-use on the self.readDict dictionary !!!!!! TEST
|
|
279 if self.readDict.has_key(-offset):
|
|
280 for read in self.readDict[-offset]:
|
|
281 if (read>=size_inf and read<= size_sup):
|
|
282 n += 1
|
|
283 if method != "conservative":
|
|
284 del self.readDict[-offset]
|
|
285 return n
|
|
286 elif polarity == "forward":
|
|
287 for offset in xrange(upstream_coord, downstream_coord+1):
|
|
288 if self.readDict.has_key(offset):
|
|
289 for read in self.readDict[offset]:
|
|
290 if (read>=size_inf and read<= size_sup):
|
|
291 n += 1
|
|
292 return n
|
|
293 elif polarity == "reverse":
|
|
294 for offset in xrange(upstream_coord, downstream_coord+1):
|
|
295 if self.readDict.has_key(-offset):
|
|
296 for read in self.readDict[-offset]:
|
|
297 if (read>=size_inf and read<= size_sup):
|
|
298 n += 1
|
|
299 return n
|
|
300
|
|
301 def readsizes (self):
|
|
302 '''return a dictionary of number of reads by size (the keys)'''
|
|
303 dicsize = {}
|
|
304 for offset in self.readDict:
|
|
305 for size in self.readDict[offset]:
|
|
306 dicsize[size] = dicsize.get(size, 0) + 1
|
|
307 for offset in range (min(dicsize.keys()), max(dicsize.keys())+1):
|
|
308 dicsize[size] = dicsize.get(size, 0) # to fill offsets with null values
|
|
309 return dicsize
|
|
310
|
|
311 def statsizes (self, upstream_coord=None, downstream_coord=None):
|
|
312 ''' migration to memory saving by specifying possible subcoordinates
|
|
313 see the readcount method for further discussion'''
|
|
314 upstream_coord = upstream_coord or self.windowoffset
|
|
315 downstream_coord = downstream_coord or self.windowoffset+self.size-1
|
|
316 L = []
|
|
317 for offset in self.readDict:
|
|
318 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
|
|
319 for size in self.readDict[offset]:
|
|
320 L.append(size)
|
|
321 meansize = mean(L)
|
|
322 stdv = std(L)
|
|
323 mediansize = median(L)
|
|
324 return meansize, mediansize, stdv
|
|
325
|
|
326 def foldEnergy (self, upstream_coord=None, downstream_coord=None):
|
|
327 ''' migration to memory saving by specifying possible subcoordinates
|
|
328 see the readcount method for further discussion'''
|
|
329 upstream_coord = upstream_coord or self.windowoffset
|
|
330 downstream_coord = downstream_coord or self.windowoffset+self.size-1
|
|
331 Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ])
|
|
332 return float(Energy[self.sequence[upstream_coord-1:downstream_coord]])
|
|
333
|
|
334 def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None):
|
|
335 ''' migration to memory saving by specifying possible subcoordinates
|
|
336 see the readcount method for further discussion. size_scope must be an interable'''
|
|
337 upstream_coord = upstream_coord or self.windowoffset
|
|
338 downstream_coord = downstream_coord or self.windowoffset+self.size-1
|
|
339 freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0}
|
|
340 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
|
|
341 for offset in self.readDict:
|
|
342 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
|
|
343 for size in self.readDict[offset]:
|
|
344 if size in size_scope:
|
|
345 startbase = self.sequence[abs(offset)-self.windowoffset]
|
|
346 if offset < 0:
|
|
347 startbase = convertDic[startbase]
|
|
348 freqDic[startbase] += 1
|
|
349 base_sum = float ( sum( freqDic.values()) )
|
|
350 if base_sum == 0:
|
|
351 return "."
|
|
352 else:
|
|
353 return freqDic["T"] / base_sum * 100
|
|
354
|
|
355 def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None):
|
|
356 ''' migration to memory saving by specifying possible subcoordinates
|
|
357 see the readcount method for further discussion. size_scope must be an interable
|
|
358 This method is similar to the Ufreq method but take strandness into account'''
|
|
359 upstream_coord = upstream_coord or self.windowoffset
|
|
360 downstream_coord = downstream_coord or self.windowoffset+self.size-1
|
|
361 freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0}
|
|
362 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
|
|
363 for offset in self.readDict:
|
|
364 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
|
|
365 for size in self.readDict[offset]:
|
|
366 if size in size_scope:
|
|
367 startbase = self.sequence[abs(offset)-self.windowoffset]
|
|
368 if offset < 0:
|
|
369 startbase = convertDic[startbase]
|
|
370 freqDic[startbase+"rev"] += 1
|
|
371 else:
|
|
372 freqDic[startbase+"for"] += 1
|
|
373 forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"])
|
|
374 reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"])
|
|
375 if forward_sum == 0 and reverse_sum == 0:
|
|
376 return ". | ."
|
|
377 elif reverse_sum == 0:
|
|
378 return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
|
|
379 elif forward_sum == 0:
|
|
380 return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
|
|
381 else:
|
|
382 return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)
|
|
383
|
|
384
|
|
385 def readplot (self):
|
|
386 readmap = {}
|
|
387 for offset in self.readDict.keys():
|
|
388 readmap[abs(offset)] = ( len(self.readDict[-abs(offset)]) , len(self.readDict[abs(offset)]) )
|
|
389 mylist = []
|
|
390 for offset in sorted(readmap):
|
|
391 if readmap[offset][1] != 0:
|
|
392 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") )
|
|
393 if readmap[offset][0] != 0:
|
|
394 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") )
|
|
395 return mylist
|
|
396
|
|
397 def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None):
|
|
398 '''Use by MirParser tool'''
|
|
399 upstream_coord = upstream_coord or 1
|
|
400 downstream_coord = downstream_coord or self.size
|
|
401 windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
|
|
402 forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+1)])
|
|
403 totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
|
|
404 totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
|
|
405 if totalforward > totalreverse:
|
|
406 majorcoverage = "forward"
|
|
407 for offset in self.readDict.keys():
|
|
408 if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ):
|
|
409 for read in self.readDict[offset]:
|
|
410 for i in xrange(read):
|
|
411 try:
|
|
412 forORrev_coverage[offset-upstream_coord+1+i] += 1
|
|
413 except KeyError:
|
|
414 continue # a sense read may span over the downstream limit
|
|
415 else:
|
|
416 majorcoverage = "reverse"
|
|
417 for offset in self.readDict.keys():
|
|
418 if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ):
|
|
419 for read in self.readDict[offset]:
|
|
420 for i in xrange(read):
|
|
421 try:
|
|
422 forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage
|
|
423 except KeyError:
|
|
424 continue # an antisense read may span over the upstream limit
|
|
425 output_list = []
|
|
426 maximum = max (forORrev_coverage.values()) or 1
|
|
427 for n in sorted (forORrev_coverage):
|
|
428 output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage))
|
|
429 return "\n".join(output_list)
|
|
430
|
|
431
|
|
432 def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None):
|
|
433 ''' migration to memory saving by specifying possible subcoordinates
|
|
434 see the readcount method for further discussion
|
|
435 scope must be a python iterable; scope define the *relative* offset range to be computed'''
|
|
436 upstream_coord = upstream_coord or self.windowoffset
|
|
437 downstream_coord = downstream_coord or self.windowoffset+self.size-1
|
|
438 query_range = range (minquery, maxquery+1)
|
|
439 target_range = range (mintarget, maxtarget+1)
|
|
440 Query_table = {}
|
|
441 Target_table = {}
|
|
442 frequency_table = dict ([(i, 0) for i in scope])
|
|
443 for offset in self.readDict:
|
|
444 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
|
|
445 for size in self.readDict[offset]:
|
|
446 if size in query_range:
|
|
447 Query_table[offset] = Query_table.get(offset, 0) + 1
|
|
448 if size in target_range:
|
|
449 Target_table[offset] = Target_table.get(offset, 0) + 1
|
|
450 for offset in Query_table:
|
|
451 for i in scope:
|
|
452 frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0))
|
|
453 if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py
|
|
454 frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table])
|
|
455 if zscore == "yes":
|
|
456 z_mean = mean(frequency_table.values() )
|
|
457 z_std = std(frequency_table.values() )
|
|
458 if z_std == 0:
|
|
459 frequency_table = dict([(i,0) for i in frequency_table] )
|
|
460 else:
|
|
461 frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] )
|
|
462 return frequency_table
|
|
463
|
|
464 def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None):
|
|
465 ''' migration to memory saving by specifying possible subcoordinates see the readcount method for further discussion
|
|
466 note that scope must be an iterable (a list or a tuple), which specifies the relative offsets that will be computed'''
|
|
467 upstream_coord = upstream_coord or self.windowoffset
|
|
468 downstream_coord = downstream_coord or self.windowoffset+self.size-1
|
|
469 query_range = range (minquery, maxquery+1)
|
|
470 target_range = range (mintarget, maxtarget+1)
|
|
471 Query_table = {}
|
|
472 Target_table = {}
|
|
473 Total_Query_Numb = 0
|
|
474 general_frequency_table = dict ([(i,0) for i in scope])
|
|
475 ## filtering the appropriate reads for the study
|
|
476 for offset in self.readDict:
|
|
477 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
|
|
478 for size in self.readDict[offset]:
|
|
479 if size in query_range:
|
|
480 Query_table[offset] = Query_table.get(offset, 0) + 1
|
|
481 Total_Query_Numb += 1
|
|
482 if size in target_range:
|
|
483 Target_table[offset] = Target_table.get(offset, 0) + 1
|
|
484 for offset in Query_table:
|
|
485 frequency_table = dict ([(i,0) for i in scope])
|
|
486 number_of_targets = 0
|
|
487 for i in scope:
|
|
488 frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0)
|
|
489 number_of_targets += Target_table.get(-offset -i +1, 0)
|
|
490 for i in scope:
|
|
491 try:
|
|
492 general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
|
|
493 except ZeroDivisionError :
|
|
494 continue
|
|
495 return general_frequency_table
|
|
496
|
|
497 def phasing (self, size_range, scope):
|
|
498 ''' to calculate autocorelation like signal - scope must be an python iterable'''
|
|
499 read_table = {}
|
|
500 total_read_number = 0
|
|
501 general_frequency_table = dict ([(i, 0) for i in scope])
|
|
502 ## read input filtering
|
|
503 for offset in self.readDict:
|
|
504 for size in self.readDict[offset]:
|
|
505 if size in size_range:
|
|
506 read_table[offset] = read_table.get(offset, 0) + 1
|
|
507 total_read_number += 1
|
|
508 ## per offset read phasing computing
|
|
509 for offset in read_table:
|
|
510 frequency_table = dict ([(i, 0) for i in scope]) # local frequency table
|
|
511 number_of_targets = 0
|
|
512 for i in scope:
|
|
513 if offset > 0:
|
|
514 frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
|
|
515 number_of_targets += read_table.get(offset + i, 0)
|
|
516 else:
|
|
517 frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
|
|
518 number_of_targets += read_table.get(offset - i, 0)
|
|
519 ## inclusion of local frequency table in the general frequency table (all offsets average)
|
|
520 for i in scope:
|
|
521 try:
|
|
522 general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
|
|
523 except ZeroDivisionError :
|
|
524 continue
|
|
525 return general_frequency_table
|
|
526
|
|
527
|
|
528
|
|
529 def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
|
|
530 '''Must do: from numpy import mean, std, to use this method; scope must be a python iterable and defines the relative offsets to compute'''
|
|
531 frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
|
|
532 z_table = {}
|
|
533 frequency_list = [frequency_table[i] for i in sorted (frequency_table)]
|
|
534 if std(frequency_list):
|
|
535 meanlist = mean(frequency_list)
|
|
536 stdlist = std(frequency_list)
|
|
537 z_list = [(i-meanlist)/stdlist for i in frequency_list]
|
|
538 return dict (zip (sorted(frequency_table), z_list) )
|
|
539 else:
|
|
540 return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) )
|
|
541
|
|
542 def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
|
|
543 frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
|
|
544 total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery)+range(mintarget,maxtarget))]) )
|
|
545 if total == 0:
|
|
546 return dict( [(i,0) for i in scope])
|
|
547 return dict( [(i, frequency_table[i]/total*100) for i in scope])
|
|
548
|
|
549 def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
|
|
550 queryhash = defaultdict(list)
|
|
551 targethash = defaultdict(list)
|
|
552 query_range = range (int(minquery), int(maxquery)+1)
|
|
553 target_range = range (int(mintarget), int(maxtarget)+1)
|
|
554 paired_sequences = []
|
|
555 for offset in self.readDict: # selection of data
|
|
556 for size in self.readDict[offset]:
|
|
557 if size in query_range:
|
|
558 queryhash[offset].append(size)
|
|
559 if size in target_range:
|
|
560 targethash[offset].append(size)
|
|
561 for offset in queryhash:
|
|
562 if offset >= 0: matched_offset = -offset - overlap + 1
|
|
563 else: matched_offset = -offset - overlap + 1
|
|
564 if targethash[matched_offset]:
|
|
565 paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) )
|
|
566 if offset >= 0:
|
|
567 for i in range (paired):
|
|
568 paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) )
|
|
569 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) )
|
|
570 if offset < 0:
|
|
571 for i in range (paired):
|
|
572 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) )
|
|
573 paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) )
|
|
574 return paired_sequences
|
|
575
|
|
576 def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
|
|
577 queryhash = defaultdict(list)
|
|
578 targethash = defaultdict(list)
|
|
579 query_range = range (int(minquery), int(maxquery)+1)
|
|
580 target_range = range (int(mintarget), int(maxtarget)+1)
|
|
581 paired_sequences = []
|
|
582
|
|
583 for offset in self.readDict: # selection of data
|
|
584 for size in self.readDict[offset]:
|
|
585 if size in query_range:
|
|
586 queryhash[offset].append(size)
|
|
587 if size in target_range:
|
|
588 targethash[offset].append(size)
|
|
589
|
|
590 for offset in queryhash:
|
|
591 matched_offset = -offset - overlap + 1
|
|
592 if targethash[matched_offset]:
|
|
593 if offset >= 0:
|
|
594 for i in queryhash[offset]:
|
|
595 paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) )
|
|
596 for i in targethash[matched_offset]:
|
|
597 paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) )
|
|
598 if offset < 0:
|
|
599 for i in queryhash[offset]:
|
|
600 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) )
|
|
601 for i in targethash[matched_offset]:
|
|
602 paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) )
|
|
603 return paired_sequences
|
|
604
|
|
605 def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
|
|
606 ''' revision of pairable on 3-12-2012, with focus on the offset shift problem (bowtie is 1-based cooordinates whereas python strings are 0-based coordinates'''
|
|
607 queryhash = defaultdict(list)
|
|
608 targethash = defaultdict(list)
|
|
609 query_range = range (int(minquery), int(maxquery)+1)
|
|
610 target_range = range (int(mintarget), int(maxtarget)+1)
|
|
611 bowtie_output = []
|
|
612
|
|
613 for offset in self.readDict: # selection of data
|
|
614 for size in self.readDict[offset]:
|
|
615 if size in query_range:
|
|
616 queryhash[offset].append(size)
|
|
617 if size in target_range:
|
|
618 targethash[offset].append(size)
|
|
619 counter = 0
|
|
620 for offset in queryhash:
|
|
621 matched_offset = -offset - overlap + 1
|
|
622 if targethash[matched_offset]:
|
|
623 if offset >= 0:
|
|
624 for i in queryhash[offset]:
|
|
625 counter += 1
|
|
626 bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # attention a la base 1-0 de l'offset
|
|
627 if offset < 0:
|
|
628 for i in queryhash[offset]:
|
|
629 counter += 1
|
|
630 bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # attention a la base 1-0 de l'offset
|
|
631 return bowtie_output
|
|
632
|
|
633
|
|
634 def __main__(bowtie_index_path, bowtie_output_path):
|
|
635 sequenceDic = get_fasta (bowtie_index_path)
|
|
636 objDic = {}
|
|
637 F = open (bowtie_output_path, "r") # F is the bowtie output taken as input
|
|
638 for line in F:
|
|
639 fields = line.split()
|
|
640 polarity = fields[1]
|
|
641 gene = fields[2]
|
|
642 offset = int(fields[3])
|
|
643 size = len (fields[4])
|
|
644 try:
|
|
645 objDic[gene].addread (polarity, offset, size)
|
|
646 except KeyError:
|
|
647 objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
|
|
648 objDic[gene].addread (polarity, offset, size)
|
|
649 F.close()
|
|
650 for gene in objDic:
|
|
651 print gene, objDic[gene].pairer(19,19,23,19,23)
|
|
652
|
|
653 if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2])
|