Mercurial > repos > davidvanzessen > mutation_analysis
comparison mutation_analysis.py @ 53:7290a88ea202 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 29 Feb 2016 10:49:39 -0500 |
parents | 5c6b9e99d576 |
children | a7381fd96dad |
comparison
equal
deleted
inserted
replaced
52:d3542f87a304 | 53:7290a88ea202 |
---|---|
1 from __future__ import division | |
1 from collections import defaultdict | 2 from collections import defaultdict |
2 import re | 3 import re |
3 import argparse | 4 import argparse |
4 | 5 |
5 parser = argparse.ArgumentParser() | 6 parser = argparse.ArgumentParser() |
54 continue | 55 continue |
55 linecount += 1 | 56 linecount += 1 |
56 linesplt = line.split("\t") | 57 linesplt = line.split("\t") |
57 ID = linesplt[IDIndex] | 58 ID = linesplt[IDIndex] |
58 genedic[ID] = linesplt[best_matchIndex] | 59 genedic[ID] = linesplt[best_matchIndex] |
59 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if | 60 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] |
60 x] if include_fr1 else [] | |
61 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] | 61 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] |
62 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] | 62 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] |
63 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] | 63 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] |
64 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] | 64 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] |
65 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] | 65 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] |
207 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs | 207 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs |
208 WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs | 208 WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs |
209 WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs | 209 WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs |
210 TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs | 210 TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs |
211 | 211 |
212 | |
213 def mean(lst): | |
214 return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0 | |
215 | |
216 | |
217 def median(lst): | |
218 lst = sorted(lst) | |
219 l = len(lst) | |
220 if l == 0: | |
221 return 0 | |
222 if l == 1: | |
223 return lst[0] | |
224 | |
225 l = int(l / 2) | |
226 | |
227 if len(lst) % 2 == 0: | |
228 print "list length is", l | |
229 return float(lst[l] + lst[(l - 1)]) / 2.0 | |
230 else: | |
231 return lst[l] | |
232 | |
233 funcs = {"mean": mean, "median": median, "sum": sum} | |
234 | |
212 directory = outfile[:outfile.rfind("/") + 1] | 235 directory = outfile[:outfile.rfind("/") + 1] |
213 value = 0 | 236 value = 0 |
214 valuedic = dict() | 237 valuedic = dict() |
215 for gene in genes: | 238 |
216 with open(directory + gene + "_value.txt", 'r') as v: | 239 for fname in funcs.keys(): |
217 valuedic[gene] = int(v.readlines()[0].rstrip()) | 240 for gene in genes: |
218 with open(directory + "total_value.txt", 'r') as v: | 241 with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: |
219 valuedic["total"] = int(v.readlines()[0].rstrip()) | 242 valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) |
243 with open(directory + "all_" + fname + "_value.txt", 'r') as v: | |
244 valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) | |
245 | |
246 print valuedic | |
247 | |
248 def get_xyz(lst, gene, f, fname): | |
249 x = int(round(f(lst))) | |
250 y = valuedic[gene + "_" + fname] | |
251 z = str(round(x / float(valuedic[gene + "_" + fname]) * 100, 1)) if valuedic[gene + "_" + fname] != 0 else "0" | |
252 return (str(x), str(y), z) | |
220 | 253 |
221 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} | 254 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} |
222 arr = ["RGYW", "WRCY", "WA", "TW"] | 255 arr = ["RGYW", "WRCY", "WA", "TW"] |
223 with open(outfile, 'w') as o: | 256 |
224 for typ in arr: | 257 for fname in funcs.keys(): |
225 o.write(typ + " (%)") | 258 func = funcs[fname] |
226 curr = dic[typ] | 259 foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" |
227 for gene in genes: | 260 with open(foutfile, 'w') as o: |
228 geneMatcher = re.compile(".*" + gene + ".*") | 261 for typ in arr: |
229 if valuedic[gene] is 0: | 262 o.write(typ + " (%)") |
230 o.write(",0,0,0") | 263 curr = dic[typ] |
231 else: | 264 for gene in genes: |
232 x = int(round(sum([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]]))) | 265 geneMatcher = re.compile(".*" + gene + ".*") |
233 y = valuedic[gene] | 266 if valuedic[gene + "_" + fname] is 0: |
234 z = str(round(x / float(valuedic[gene]) * 100, 1)) | 267 o.write(",0,0,0") |
235 o.write("," + str(x) + "," + str(y) + "," + z) | 268 else: |
236 # for total | 269 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) |
237 x = int(round(sum([y for x, y in curr.iteritems()]))) | 270 o.write("," + x + "," + y + "," + z) |
238 y = valuedic["total"] | 271 # for total |
239 z = str(round(x / float(valuedic["total"]) * 100, 1)) | 272 x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) |
240 o.write("," + str(x) + "," + str(y) + "," + z + "\n") | 273 o.write("," + x + "," + y + "," + z + "\n") |
241 | 274 |
242 | 275 |
243 # for testing | 276 # for testing |
244 seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt" | 277 seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt" |
245 with open(seq_motif_file, 'w') as o: | 278 with open(seq_motif_file, 'w') as o: |
246 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") | 279 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") |
247 for ID in IDlist: | 280 for ID in IDlist: |
248 o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str( | 281 o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") |
249 round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") |