Mercurial > repos > drosofff > mi_rna_parser
comparison smRtools.py @ 7:20b8ff9c1cb9 draft default tip
Uploaded
author | drosofff |
---|---|
date | Mon, 23 Jun 2014 05:24:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:11d042a0fbdf | 7:20b8ff9c1cb9 |
---|---|
1 #!/usr/bin/python | |
2 # version 1 7-5-2012 unification of the SmRNAwindow class | |
3 | |
4 import sys, subprocess | |
5 from collections import defaultdict, OrderedDict | |
6 from numpy import mean, median, std | |
7 from scipy import stats | |
8 | |
9 def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"): | |
10 '''This function will return a dictionary containing fasta identifiers as keys and the | |
11 sequence as values. Index must be the path to a fasta file.''' | |
12 p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines | |
13 outputlines = p.stdout.readlines() | |
14 p.wait() | |
15 item_dic = {} | |
16 for line in outputlines: | |
17 if (line[0] == ">"): | |
18 try: | |
19 item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item | |
20 except: pass | |
21 current_item = line[1:].rstrip().split()[0] #take the first word before space because bowtie splits headers ! | |
22 item_dic[current_item] = "" | |
23 stringlist=[] | |
24 else: | |
25 stringlist.append(line.rstrip() ) | |
26 item_dic[current_item] = "".join(stringlist) # for the last item | |
27 return item_dic | |
28 | |
29 def get_fasta_headers (index): | |
30 p = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines | |
31 outputlines = p.stdout.readlines() | |
32 p.wait() | |
33 item_dic = {} | |
34 for line in outputlines: | |
35 header = line.rstrip().split()[0] #take the first word before space because bowtie splits headers ! | |
36 item_dic[header] = 1 | |
37 return item_dic | |
38 | |
39 | |
40 def get_file_sample (file, numberoflines): | |
41 '''import random to use this function''' | |
42 F=open(file) | |
43 fullfile = F.read().splitlines() | |
44 F.close() | |
45 if len(fullfile) < numberoflines: | |
46 return "sample size exceeds file size" | |
47 return random.sample(fullfile, numberoflines) | |
48 | |
49 def get_fasta_from_history (file): | |
50 F = open (file, "r") | |
51 item_dic = {} | |
52 for line in F: | |
53 if (line[0] == ">"): | |
54 try: | |
55 item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item | |
56 except: pass | |
57 current_item = line[1:-1].split()[0] #take the first word before space because bowtie splits headers ! | |
58 item_dic[current_item] = "" | |
59 stringlist=[] | |
60 else: | |
61 stringlist.append(line[:-1]) | |
62 item_dic[current_item] = "".join(stringlist) # for the last item | |
63 return item_dic | |
64 | |
65 def antipara (sequence): | |
66 antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"} | |
67 revseq = sequence[::-1] | |
68 return "".join([antidict[i] for i in revseq]) | |
69 | |
70 def RNAtranslate (sequence): | |
71 return "".join([i if i in "AGCN" else "U" for i in sequence]) | |
72 def DNAtranslate (sequence): | |
73 return "".join([i if i in "AGCN" else "T" for i in sequence]) | |
74 | |
75 def RNAfold (sequence_list): | |
76 thestring= "\n".join(sequence_list) | |
77 p = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
78 output=p.communicate(thestring)[0] | |
79 p.wait() | |
80 output=output.split("\n") | |
81 if not output[-1]: output = output[:-1] # nasty patch to remove last empty line | |
82 buffer=[] | |
83 for line in output: | |
84 if line[0] in ["N","A","T","U","G","C"]: | |
85 buffer.append(DNAtranslate(line)) | |
86 if line[0] in ["(",".",")"]: | |
87 fields=line.split("(") | |
88 energy= fields[-1] | |
89 energy = energy[:-1] # remove the ) parenthesis | |
90 energy=float(energy) | |
91 buffer.append(str(energy)) | |
92 return dict(zip(buffer[::2], buffer[1::2])) | |
93 | |
94 def extractsubinstance (start, end, instance): | |
95 ''' Testing whether this can be an function external to the class to save memory''' | |
96 subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start) | |
97 subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1) | |
98 upcoordinate = [i for i in range(start,end+1) if instance.readDict[i] ] | |
99 downcoordinate = [-i for i in range(start,end+1) if instance.readDict[-i] ] | |
100 for i in upcoordinate: | |
101 subinstance.readDict[i]=instance.readDict[i] | |
102 for i in downcoordinate: | |
103 subinstance.readDict[i]=instance.readDict[i] | |
104 return subinstance | |
105 | |
106 class HandleSmRNAwindows: | |
107 def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined"): | |
108 self.biosample = biosample | |
109 self.alignmentFile = alignmentFile | |
110 self.alignmentFileFormat = alignmentFileFormat # can be "tabular" or "sam" | |
111 self.genomeRefFile = genomeRefFile | |
112 self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource" | |
113 self.alignedReads = 0 | |
114 self.instanceDict = {} | |
115 if genomeRefFormat == "bowtieIndex": | |
116 self.itemDict = get_fasta (genomeRefFile) | |
117 elif genomeRefFormat == "fastaSource": | |
118 self.itemDict = get_fasta_from_history (genomeRefFile) | |
119 for item in self.itemDict: | |
120 self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample) # create as many instances as there is items | |
121 self.readfile() | |
122 | |
123 def readfile (self) : | |
124 if self.alignmentFileFormat == "tabular": | |
125 F = open (self.alignmentFile, "r") | |
126 for line in F: | |
127 fields = line.split() | |
128 polarity = fields[1] | |
129 gene = fields[2] | |
130 offset = int(fields[3]) | |
131 size = len (fields[4]) | |
132 self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow | |
133 self.alignedReads += 1 | |
134 F.close() | |
135 elif self.alignmentFileFormat == "sam": | |
136 F = open (self.alignmentFile, "r") | |
137 dict = {"0":"+", "16":"-"} | |
138 for line in F: | |
139 if line[0]=='@': | |
140 continue | |
141 fields = line.split() | |
142 if fields[2] == "*": continue | |
143 polarity = dict[fields[1]] | |
144 gene = fields[2] | |
145 offset = int(fields[3]) | |
146 size = len (fields[9]) | |
147 self.instanceDict[gene].addread (polarity, offset, size) # sam format is already 1-based coordinates | |
148 self.alignedReads += 1 | |
149 F.close() | |
150 elif self.alignmentFileFormat == "bam": | |
151 import pysam | |
152 samfile = pysam.Samfile(self.alignmentFile) | |
153 for read in samfile: | |
154 if read.tid == -1: | |
155 continue # filter out unaligned reads | |
156 if read.is_reverse: | |
157 polarity="-" | |
158 else: | |
159 polarity="+" | |
160 gene = samfile.getrname(read.tid) | |
161 offset = read.pos | |
162 size = read.qlen | |
163 self.instanceDict[gene].addread (polarity, offset+1, size) # pysam converts coordinates to 0-based (https://media.readthedocs.org/pdf/pysam/latest/pysam.pdf) | |
164 self.alignedReads += 1 | |
165 return | |
166 | |
167 def CountFeatures (self, GFF3="path/to/file"): | |
168 featureDict = defaultdict(int) | |
169 F = open (GFF3, "r") | |
170 for line in F: | |
171 if line[0] == "#": continue | |
172 fields = line[:-1].split() | |
173 chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6] | |
174 featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive") | |
175 F.close() | |
176 return featureDict | |
177 | |
178 class SmRNAwindow: | |
179 | |
180 def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined"): | |
181 self.biosample = biosample | |
182 self.sequence = sequence | |
183 self.gene = gene | |
184 self.windowoffset = windowoffset | |
185 self.size = len(sequence) | |
186 self.readDict = defaultdict(list) # with a {+/-offset:[size1, size2, ...], ...} | |
187 self.matchedreadsUp = 0 | |
188 self.matchedreadsDown = 0 | |
189 | |
190 def addread (self, polarity, offset, size): | |
191 '''ATTENTION ATTENTION ATTENTION''' | |
192 ''' We removed the conversion from 0 to 1 based offset, as we do this now during readparsing.''' | |
193 if polarity == "+": | |
194 self.readDict[offset].append(size) | |
195 self.matchedreadsUp += 1 | |
196 else: | |
197 self.readDict[-(offset + size -1)].append(size) | |
198 self.matchedreadsDown += 1 | |
199 return | |
200 | |
201 def barycenter (self, upstream_coord=None, downstream_coord=None): | |
202 '''refactored 24-12-2013 to save memory and introduce offset filtering see readcount method for further discussion on that | |
203 In this version, attempt to replace the dictionary structure by a list of tupple to save memory too''' | |
204 upstream_coord = upstream_coord or self.windowoffset | |
205 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
206 window_size = downstream_coord - upstream_coord +1 | |
207 def weigthAverage (TuppleList): | |
208 weightSum = 0 | |
209 PonderWeightSum = 0 | |
210 for tuple in TuppleList: | |
211 PonderWeightSum += tuple[0] * tuple[1] | |
212 weightSum += tuple[1] | |
213 if weightSum > 0: | |
214 return PonderWeightSum / float(weightSum) | |
215 else: | |
216 return 0 | |
217 forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both forward and in the proper offset window | |
218 reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both reverse and in the proper offset window | |
219 Fbarycenter = (weigthAverage (forwardTuppleList) - upstream_coord) / window_size | |
220 Rbarycenter = (weigthAverage (reverseTuppleList) - upstream_coord) / window_size | |
221 return Fbarycenter, Rbarycenter | |
222 | |
223 def correlation_mapper (self, reference, window_size): | |
224 '''to map correlation with a sliding window 26-2-2013''' | |
225 if window_size > self.size: | |
226 return [] | |
227 F=open(reference, "r") | |
228 reference_forward = [] | |
229 reference_reverse = [] | |
230 for line in F: | |
231 fields=line.split() | |
232 reference_forward.append(int(float(fields[1]))) | |
233 reference_reverse.append(int(float(fields[2]))) | |
234 F.close() | |
235 local_object_forward=[] | |
236 local_object_reverse=[] | |
237 ## Dict to list for the local object | |
238 for i in range(1, self.size+1): | |
239 local_object_forward.append(len(self.readDict[i])) | |
240 local_object_reverse.append(len(self.readDict[-i])) | |
241 ## start compiling results by slides | |
242 results=[] | |
243 for coordinate in range(self.size - window_size): | |
244 local_forward=local_object_forward[coordinate:coordinate + window_size] | |
245 local_reverse=local_object_reverse[coordinate:coordinate + window_size] | |
246 if sum(local_forward) == 0 or sum(local_reverse) == 0: | |
247 continue | |
248 try: | |
249 reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward) | |
250 reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse) | |
251 if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2): | |
252 results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]]) | |
253 except: | |
254 pass | |
255 return results | |
256 | |
257 def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"): | |
258 '''refactored 24-12-2013 to save memory and introduce offset filtering | |
259 take a look at the defaut parameters that cannot be defined relatively to the instance are they are defined before instanciation | |
260 the trick is to pass None and then test | |
261 polarity parameter can take "both", "forward" or "reverse" as value''' | |
262 upstream_coord = upstream_coord or self.windowoffset | |
263 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
264 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both": | |
265 return self.matchedreadsUp + self.matchedreadsDown | |
266 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward": | |
267 return self.matchedreadsUp | |
268 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse": | |
269 return self.matchedreadsDown | |
270 n=0 | |
271 if polarity == "both": | |
272 for offset in xrange(upstream_coord, downstream_coord+1): | |
273 if self.readDict.has_key(offset): | |
274 for read in self.readDict[offset]: | |
275 if (read>=size_inf and read<= size_sup): | |
276 n += 1 | |
277 if method != "conservative": | |
278 del self.readDict[offset] ## Carefull ! precludes re-use on the self.readDict dictionary !!!!!! TEST | |
279 if self.readDict.has_key(-offset): | |
280 for read in self.readDict[-offset]: | |
281 if (read>=size_inf and read<= size_sup): | |
282 n += 1 | |
283 if method != "conservative": | |
284 del self.readDict[-offset] | |
285 return n | |
286 elif polarity == "forward": | |
287 for offset in xrange(upstream_coord, downstream_coord+1): | |
288 if self.readDict.has_key(offset): | |
289 for read in self.readDict[offset]: | |
290 if (read>=size_inf and read<= size_sup): | |
291 n += 1 | |
292 return n | |
293 elif polarity == "reverse": | |
294 for offset in xrange(upstream_coord, downstream_coord+1): | |
295 if self.readDict.has_key(-offset): | |
296 for read in self.readDict[-offset]: | |
297 if (read>=size_inf and read<= size_sup): | |
298 n += 1 | |
299 return n | |
300 | |
301 def readsizes (self): | |
302 '''return a dictionary of number of reads by size (the keys)''' | |
303 dicsize = {} | |
304 for offset in self.readDict: | |
305 for size in self.readDict[offset]: | |
306 dicsize[size] = dicsize.get(size, 0) + 1 | |
307 for offset in range (min(dicsize.keys()), max(dicsize.keys())+1): | |
308 dicsize[size] = dicsize.get(size, 0) # to fill offsets with null values | |
309 return dicsize | |
310 | |
311 def statsizes (self, upstream_coord=None, downstream_coord=None): | |
312 ''' migration to memory saving by specifying possible subcoordinates | |
313 see the readcount method for further discussion''' | |
314 upstream_coord = upstream_coord or self.windowoffset | |
315 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
316 L = [] | |
317 for offset in self.readDict: | |
318 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
319 for size in self.readDict[offset]: | |
320 L.append(size) | |
321 meansize = mean(L) | |
322 stdv = std(L) | |
323 mediansize = median(L) | |
324 return meansize, mediansize, stdv | |
325 | |
326 def foldEnergy (self, upstream_coord=None, downstream_coord=None): | |
327 ''' migration to memory saving by specifying possible subcoordinates | |
328 see the readcount method for further discussion''' | |
329 upstream_coord = upstream_coord or self.windowoffset | |
330 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
331 Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ]) | |
332 return float(Energy[self.sequence[upstream_coord-1:downstream_coord]]) | |
333 | |
334 def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None): | |
335 ''' migration to memory saving by specifying possible subcoordinates | |
336 see the readcount method for further discussion. size_scope must be an interable''' | |
337 upstream_coord = upstream_coord or self.windowoffset | |
338 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
339 freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0} | |
340 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"} | |
341 for offset in self.readDict: | |
342 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
343 for size in self.readDict[offset]: | |
344 if size in size_scope: | |
345 startbase = self.sequence[abs(offset)-self.windowoffset] | |
346 if offset < 0: | |
347 startbase = convertDic[startbase] | |
348 freqDic[startbase] += 1 | |
349 base_sum = float ( sum( freqDic.values()) ) | |
350 if base_sum == 0: | |
351 return "." | |
352 else: | |
353 return freqDic["T"] / base_sum * 100 | |
354 | |
355 def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None): | |
356 ''' migration to memory saving by specifying possible subcoordinates | |
357 see the readcount method for further discussion. size_scope must be an interable | |
358 This method is similar to the Ufreq method but take strandness into account''' | |
359 upstream_coord = upstream_coord or self.windowoffset | |
360 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
361 freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0} | |
362 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"} | |
363 for offset in self.readDict: | |
364 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
365 for size in self.readDict[offset]: | |
366 if size in size_scope: | |
367 startbase = self.sequence[abs(offset)-self.windowoffset] | |
368 if offset < 0: | |
369 startbase = convertDic[startbase] | |
370 freqDic[startbase+"rev"] += 1 | |
371 else: | |
372 freqDic[startbase+"for"] += 1 | |
373 forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"]) | |
374 reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"]) | |
375 if forward_sum == 0 and reverse_sum == 0: | |
376 return ". | ." | |
377 elif reverse_sum == 0: | |
378 return "%s | ." % (freqDic["Tfor"] / forward_sum * 100) | |
379 elif forward_sum == 0: | |
380 return ". | %s" % (freqDic["Trev"] / reverse_sum * 100) | |
381 else: | |
382 return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100) | |
383 | |
384 | |
385 def readplot (self): | |
386 readmap = {} | |
387 for offset in self.readDict.keys(): | |
388 readmap[abs(offset)] = ( len(self.readDict[-abs(offset)]) , len(self.readDict[abs(offset)]) ) | |
389 mylist = [] | |
390 for offset in sorted(readmap): | |
391 if readmap[offset][1] != 0: | |
392 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") ) | |
393 if readmap[offset][0] != 0: | |
394 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") ) | |
395 return mylist | |
396 | |
397 def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None): | |
398 '''Use by MirParser tool''' | |
399 upstream_coord = upstream_coord or 1 | |
400 downstream_coord = downstream_coord or self.size | |
401 windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord) | |
402 forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+1)]) | |
403 totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward") | |
404 totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse") | |
405 if totalforward > totalreverse: | |
406 majorcoverage = "forward" | |
407 for offset in self.readDict.keys(): | |
408 if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ): | |
409 for read in self.readDict[offset]: | |
410 for i in xrange(read): | |
411 try: | |
412 forORrev_coverage[offset-upstream_coord+1+i] += 1 | |
413 except KeyError: | |
414 continue # a sense read may span over the downstream limit | |
415 else: | |
416 majorcoverage = "reverse" | |
417 for offset in self.readDict.keys(): | |
418 if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ): | |
419 for read in self.readDict[offset]: | |
420 for i in xrange(read): | |
421 try: | |
422 forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage | |
423 except KeyError: | |
424 continue # an antisense read may span over the upstream limit | |
425 output_list = [] | |
426 maximum = max (forORrev_coverage.values()) or 1 | |
427 for n in sorted (forORrev_coverage): | |
428 output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage)) | |
429 return "\n".join(output_list) | |
430 | |
431 | |
432 def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None): | |
433 ''' migration to memory saving by specifying possible subcoordinates | |
434 see the readcount method for further discussion | |
435 scope must be a python iterable; scope define the *relative* offset range to be computed''' | |
436 upstream_coord = upstream_coord or self.windowoffset | |
437 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
438 query_range = range (minquery, maxquery+1) | |
439 target_range = range (mintarget, maxtarget+1) | |
440 Query_table = {} | |
441 Target_table = {} | |
442 frequency_table = dict ([(i, 0) for i in scope]) | |
443 for offset in self.readDict: | |
444 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
445 for size in self.readDict[offset]: | |
446 if size in query_range: | |
447 Query_table[offset] = Query_table.get(offset, 0) + 1 | |
448 if size in target_range: | |
449 Target_table[offset] = Target_table.get(offset, 0) + 1 | |
450 for offset in Query_table: | |
451 for i in scope: | |
452 frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0)) | |
453 if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py | |
454 frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table]) | |
455 if zscore == "yes": | |
456 z_mean = mean(frequency_table.values() ) | |
457 z_std = std(frequency_table.values() ) | |
458 if z_std == 0: | |
459 frequency_table = dict([(i,0) for i in frequency_table] ) | |
460 else: | |
461 frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] ) | |
462 return frequency_table | |
463 | |
464 def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None): | |
465 ''' migration to memory saving by specifying possible subcoordinates see the readcount method for further discussion | |
466 note that scope must be an iterable (a list or a tuple), which specifies the relative offsets that will be computed''' | |
467 upstream_coord = upstream_coord or self.windowoffset | |
468 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
469 query_range = range (minquery, maxquery+1) | |
470 target_range = range (mintarget, maxtarget+1) | |
471 Query_table = {} | |
472 Target_table = {} | |
473 Total_Query_Numb = 0 | |
474 general_frequency_table = dict ([(i,0) for i in scope]) | |
475 ## filtering the appropriate reads for the study | |
476 for offset in self.readDict: | |
477 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
478 for size in self.readDict[offset]: | |
479 if size in query_range: | |
480 Query_table[offset] = Query_table.get(offset, 0) + 1 | |
481 Total_Query_Numb += 1 | |
482 if size in target_range: | |
483 Target_table[offset] = Target_table.get(offset, 0) + 1 | |
484 for offset in Query_table: | |
485 frequency_table = dict ([(i,0) for i in scope]) | |
486 number_of_targets = 0 | |
487 for i in scope: | |
488 frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0) | |
489 number_of_targets += Target_table.get(-offset -i +1, 0) | |
490 for i in scope: | |
491 try: | |
492 general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i] | |
493 except ZeroDivisionError : | |
494 continue | |
495 return general_frequency_table | |
496 | |
497 def phasing (self, size_range, scope): | |
498 ''' to calculate autocorelation like signal - scope must be an python iterable''' | |
499 read_table = {} | |
500 total_read_number = 0 | |
501 general_frequency_table = dict ([(i, 0) for i in scope]) | |
502 ## read input filtering | |
503 for offset in self.readDict: | |
504 for size in self.readDict[offset]: | |
505 if size in size_range: | |
506 read_table[offset] = read_table.get(offset, 0) + 1 | |
507 total_read_number += 1 | |
508 ## per offset read phasing computing | |
509 for offset in read_table: | |
510 frequency_table = dict ([(i, 0) for i in scope]) # local frequency table | |
511 number_of_targets = 0 | |
512 for i in scope: | |
513 if offset > 0: | |
514 frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0) | |
515 number_of_targets += read_table.get(offset + i, 0) | |
516 else: | |
517 frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0) | |
518 number_of_targets += read_table.get(offset - i, 0) | |
519 ## inclusion of local frequency table in the general frequency table (all offsets average) | |
520 for i in scope: | |
521 try: | |
522 general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i] | |
523 except ZeroDivisionError : | |
524 continue | |
525 return general_frequency_table | |
526 | |
527 | |
528 | |
529 def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope): | |
530 '''Must do: from numpy import mean, std, to use this method; scope must be a python iterable and defines the relative offsets to compute''' | |
531 frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope) | |
532 z_table = {} | |
533 frequency_list = [frequency_table[i] for i in sorted (frequency_table)] | |
534 if std(frequency_list): | |
535 meanlist = mean(frequency_list) | |
536 stdlist = std(frequency_list) | |
537 z_list = [(i-meanlist)/stdlist for i in frequency_list] | |
538 return dict (zip (sorted(frequency_table), z_list) ) | |
539 else: | |
540 return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) ) | |
541 | |
542 def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope): | |
543 frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope) | |
544 total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery)+range(mintarget,maxtarget))]) ) | |
545 if total == 0: | |
546 return dict( [(i,0) for i in scope]) | |
547 return dict( [(i, frequency_table[i]/total*100) for i in scope]) | |
548 | |
549 def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget): | |
550 queryhash = defaultdict(list) | |
551 targethash = defaultdict(list) | |
552 query_range = range (int(minquery), int(maxquery)+1) | |
553 target_range = range (int(mintarget), int(maxtarget)+1) | |
554 paired_sequences = [] | |
555 for offset in self.readDict: # selection of data | |
556 for size in self.readDict[offset]: | |
557 if size in query_range: | |
558 queryhash[offset].append(size) | |
559 if size in target_range: | |
560 targethash[offset].append(size) | |
561 for offset in queryhash: | |
562 if offset >= 0: matched_offset = -offset - overlap + 1 | |
563 else: matched_offset = -offset - overlap + 1 | |
564 if targethash[matched_offset]: | |
565 paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) ) | |
566 if offset >= 0: | |
567 for i in range (paired): | |
568 paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) ) | |
569 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) ) | |
570 if offset < 0: | |
571 for i in range (paired): | |
572 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) ) | |
573 paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) ) | |
574 return paired_sequences | |
575 | |
576 def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget): | |
577 queryhash = defaultdict(list) | |
578 targethash = defaultdict(list) | |
579 query_range = range (int(minquery), int(maxquery)+1) | |
580 target_range = range (int(mintarget), int(maxtarget)+1) | |
581 paired_sequences = [] | |
582 | |
583 for offset in self.readDict: # selection of data | |
584 for size in self.readDict[offset]: | |
585 if size in query_range: | |
586 queryhash[offset].append(size) | |
587 if size in target_range: | |
588 targethash[offset].append(size) | |
589 | |
590 for offset in queryhash: | |
591 matched_offset = -offset - overlap + 1 | |
592 if targethash[matched_offset]: | |
593 if offset >= 0: | |
594 for i in queryhash[offset]: | |
595 paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) ) | |
596 for i in targethash[matched_offset]: | |
597 paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) ) | |
598 if offset < 0: | |
599 for i in queryhash[offset]: | |
600 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) ) | |
601 for i in targethash[matched_offset]: | |
602 paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) ) | |
603 return paired_sequences | |
604 | |
605 def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget): | |
606 ''' revision of pairable on 3-12-2012, with focus on the offset shift problem (bowtie is 1-based cooordinates whereas python strings are 0-based coordinates''' | |
607 queryhash = defaultdict(list) | |
608 targethash = defaultdict(list) | |
609 query_range = range (int(minquery), int(maxquery)+1) | |
610 target_range = range (int(mintarget), int(maxtarget)+1) | |
611 bowtie_output = [] | |
612 | |
613 for offset in self.readDict: # selection of data | |
614 for size in self.readDict[offset]: | |
615 if size in query_range: | |
616 queryhash[offset].append(size) | |
617 if size in target_range: | |
618 targethash[offset].append(size) | |
619 counter = 0 | |
620 for offset in queryhash: | |
621 matched_offset = -offset - overlap + 1 | |
622 if targethash[matched_offset]: | |
623 if offset >= 0: | |
624 for i in queryhash[offset]: | |
625 counter += 1 | |
626 bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # attention a la base 1-0 de l'offset | |
627 if offset < 0: | |
628 for i in queryhash[offset]: | |
629 counter += 1 | |
630 bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # attention a la base 1-0 de l'offset | |
631 return bowtie_output | |
632 | |
633 | |
634 def __main__(bowtie_index_path, bowtie_output_path): | |
635 sequenceDic = get_fasta (bowtie_index_path) | |
636 objDic = {} | |
637 F = open (bowtie_output_path, "r") # F is the bowtie output taken as input | |
638 for line in F: | |
639 fields = line.split() | |
640 polarity = fields[1] | |
641 gene = fields[2] | |
642 offset = int(fields[3]) | |
643 size = len (fields[4]) | |
644 try: | |
645 objDic[gene].addread (polarity, offset, size) | |
646 except KeyError: | |
647 objDic[gene] = SmRNAwindow(gene, sequenceDic[gene]) | |
648 objDic[gene].addread (polarity, offset, size) | |
649 F.close() | |
650 for gene in objDic: | |
651 print gene, objDic[gene].pairer(19,19,23,19,23) | |
652 | |
653 if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2]) |