comparison mutation_analysis.py @ 31:c623690e3b81 draft

Uploaded
author davidvanzessen
date Thu, 09 Apr 2015 08:52:18 -0400
parents 7e44617c9ca4
children 2a7343e4be5a
comparison
equal deleted inserted replaced
30:7e44617c9ca4 31:c623690e3b81
2 import argparse 2 import argparse
3 3
4 parser = argparse.ArgumentParser() 4 parser = argparse.ArgumentParser()
5 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") 5 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
6 parser.add_argument("--genes", help="The genes available in the 'best_match' column") 6 parser.add_argument("--genes", help="The genes available in the 'best_match' column")
7 parser.add_argument("--includefr1", help="The genes available in the 'best_match' column")
7 parser.add_argument("--output", help="Output file") 8 parser.add_argument("--output", help="Output file")
8 9
9 args = parser.parse_args() 10 args = parser.parse_args()
10 11
11 infile = args.input 12 infile = args.input
12 genes = str(args.genes).split(",") 13 genes = str(args.genes).split(",")
14 include_fr1 = True if args.includefr1 == "yes" else False
13 outfile = args.output 15 outfile = args.output
14 16
15 genedic = dict() 17 genedic = dict()
16 18
17 mutationdic = dict() 19 mutationdic = dict()
44 continue 46 continue
45 linecount += 1 47 linecount += 1
46 linesplt = line.split("\t") 48 linesplt = line.split("\t")
47 ID = linesplt[IDIndex] 49 ID = linesplt[IDIndex]
48 genedic[ID] = linesplt[best_matchIndex] 50 genedic[ID] = linesplt[best_matchIndex]
49 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] 51 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []
50 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] 52 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]
51 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] 53 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]
52 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] 54 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
53 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] 55 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
54 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] 56 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
106 RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] 108 RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
107 WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] 109 WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]
108 WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] 110 WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
109 TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] 111 TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
110 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0,0,0,0 112 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0,0,0,0
111 for (x,y,z) in RGYW: #RGYWCount[ID] = sum([1 for (x,y,z) in RGYW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
112 if not z or z == "CDR3":
113 continue
114 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])
115 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW])
116 if mutations_in_motif > 0:
117 RGYWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1
118 if in_other_motifs > 1:
119 print in_other_motifs, ID, "RGYW", x, y, ([(x,y,z) for (xother, yother, zother) in WRCY if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in WA if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in TW if ((x <= int(xother) <= y) and (x <= int(yother) <= y))])
120 113
121 for (x,y,z) in WRCY: #WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) 114 mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
122 if not z or z == "CDR3": 115 for mutation in mutationList:
123 continue 116 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
124 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) 117 mutation_in_RGYW = any([(start <= int(where) <= end) for (start,end,region) in RGYW])
125 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) 118 mutation_in_WRCY = any([(start <= int(where) <= end) for (start,end,region) in WRCY])
126 if mutations_in_motif > 0: 119 mutation_in_WA = any([(start <= int(where) <= end) for (start,end,region) in WA])
127 WRCYCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 120 mutation_in_TW = any([(start <= int(where) <= end) for (start,end,region) in TW])
128 if in_other_motifs > 1: 121
129 print in_other_motifs, ID, "WRCY", x, y 122 in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])
130 123
131 for (x,y,z) in WA: #WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) 124 if in_how_many_motifs > 0:
132 if not z or z == "CDR3": 125 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs
133 continue 126 WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
134 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) 127 WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
135 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) 128 TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
136 if mutations_in_motif > 0:
137 WACount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1
138 if in_other_motifs > 1:
139 print in_other_motifs, ID, "WA", x, y
140
141 for (x,y,z) in TW: #TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
142 if not z or z == "CDR3":
143 continue
144 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])
145 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA])
146 if mutations_in_motif > 0:
147 TWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1
148 if in_other_motifs > 1:
149 print in_other_motifs, ID, "TW", x, y
150
151 129
152 directory = outfile[:outfile.rfind("/") + 1] 130 directory = outfile[:outfile.rfind("/") + 1]
153 value = 0 131 value = 0
154 valuedic = dict() 132 valuedic = dict()
155 for gene in genes: 133 for gene in genes:
187 for ID in IDlist: 165 for ID in IDlist:
188 if first: 166 if first:
189 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") 167 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n")
190 first = False 168 first = False
191 continue 169 continue
192 o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n") 170 o.write(ID + "\t" + str(int(RGYWCount[ID])) + "\t" + str(int(WRCYCount[ID])) + "\t" + str(int(WACount[ID])) + "\t" + str(int(TWCount[ID])) + "\n")