comparison mutation_analysis.py @ 53:7290a88ea202 draft

Uploaded
author davidvanzessen
date Mon, 29 Feb 2016 10:49:39 -0500
parents 5c6b9e99d576
children a7381fd96dad
comparison
equal deleted inserted replaced
52:d3542f87a304 53:7290a88ea202
1 from __future__ import division
1 from collections import defaultdict 2 from collections import defaultdict
2 import re 3 import re
3 import argparse 4 import argparse
4 5
5 parser = argparse.ArgumentParser() 6 parser = argparse.ArgumentParser()
54 continue 55 continue
55 linecount += 1 56 linecount += 1
56 linesplt = line.split("\t") 57 linesplt = line.split("\t")
57 ID = linesplt[IDIndex] 58 ID = linesplt[IDIndex]
58 genedic[ID] = linesplt[best_matchIndex] 59 genedic[ID] = linesplt[best_matchIndex]
59 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if 60 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []
60 x] if include_fr1 else []
61 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] 61 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]
62 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] 62 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]
63 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] 63 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
64 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] 64 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
65 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] 65 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
207 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs 207 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs
208 WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs 208 WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
209 WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs 209 WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
210 TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs 210 TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
211 211
212
213 def mean(lst):
214 return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0
215
216
217 def median(lst):
218 lst = sorted(lst)
219 l = len(lst)
220 if l == 0:
221 return 0
222 if l == 1:
223 return lst[0]
224
225 l = int(l / 2)
226
227 if len(lst) % 2 == 0:
228 print "list length is", l
229 return float(lst[l] + lst[(l - 1)]) / 2.0
230 else:
231 return lst[l]
232
233 funcs = {"mean": mean, "median": median, "sum": sum}
234
212 directory = outfile[:outfile.rfind("/") + 1] 235 directory = outfile[:outfile.rfind("/") + 1]
213 value = 0 236 value = 0
214 valuedic = dict() 237 valuedic = dict()
215 for gene in genes: 238
216 with open(directory + gene + "_value.txt", 'r') as v: 239 for fname in funcs.keys():
217 valuedic[gene] = int(v.readlines()[0].rstrip()) 240 for gene in genes:
218 with open(directory + "total_value.txt", 'r') as v: 241 with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
219 valuedic["total"] = int(v.readlines()[0].rstrip()) 242 valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
243 with open(directory + "all_" + fname + "_value.txt", 'r') as v:
244 valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
245
246 print valuedic
247
248 def get_xyz(lst, gene, f, fname):
249 x = int(round(f(lst)))
250 y = valuedic[gene + "_" + fname]
251 z = str(round(x / float(valuedic[gene + "_" + fname]) * 100, 1)) if valuedic[gene + "_" + fname] != 0 else "0"
252 return (str(x), str(y), z)
220 253
221 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} 254 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
222 arr = ["RGYW", "WRCY", "WA", "TW"] 255 arr = ["RGYW", "WRCY", "WA", "TW"]
223 with open(outfile, 'w') as o: 256
224 for typ in arr: 257 for fname in funcs.keys():
225 o.write(typ + " (%)") 258 func = funcs[fname]
226 curr = dic[typ] 259 foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
227 for gene in genes: 260 with open(foutfile, 'w') as o:
228 geneMatcher = re.compile(".*" + gene + ".*") 261 for typ in arr:
229 if valuedic[gene] is 0: 262 o.write(typ + " (%)")
230 o.write(",0,0,0") 263 curr = dic[typ]
231 else: 264 for gene in genes:
232 x = int(round(sum([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]]))) 265 geneMatcher = re.compile(".*" + gene + ".*")
233 y = valuedic[gene] 266 if valuedic[gene + "_" + fname] is 0:
234 z = str(round(x / float(valuedic[gene]) * 100, 1)) 267 o.write(",0,0,0")
235 o.write("," + str(x) + "," + str(y) + "," + z) 268 else:
236 # for total 269 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
237 x = int(round(sum([y for x, y in curr.iteritems()]))) 270 o.write("," + x + "," + y + "," + z)
238 y = valuedic["total"] 271 # for total
239 z = str(round(x / float(valuedic["total"]) * 100, 1)) 272 x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
240 o.write("," + str(x) + "," + str(y) + "," + z + "\n") 273 o.write("," + x + "," + y + "," + z + "\n")
241 274
242 275
243 # for testing 276 # for testing
244 seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt" 277 seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"
245 with open(seq_motif_file, 'w') as o: 278 with open(seq_motif_file, 'w') as o:
246 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") 279 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n")
247 for ID in IDlist: 280 for ID in IDlist:
248 o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str( 281 o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")
249 round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")