Mercurial > repos > davidvanzessen > mutation_analysis
comparison mutation_analysis.py @ 31:c623690e3b81 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 09 Apr 2015 08:52:18 -0400 |
parents | 7e44617c9ca4 |
children | 2a7343e4be5a |
comparison
equal
deleted
inserted
replaced
30:7e44617c9ca4 | 31:c623690e3b81 |
---|---|
2 import argparse | 2 import argparse |
3 | 3 |
4 parser = argparse.ArgumentParser() | 4 parser = argparse.ArgumentParser() |
5 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") | 5 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") |
6 parser.add_argument("--genes", help="The genes available in the 'best_match' column") | 6 parser.add_argument("--genes", help="The genes available in the 'best_match' column") |
7 parser.add_argument("--includefr1", help="The genes available in the 'best_match' column") | |
7 parser.add_argument("--output", help="Output file") | 8 parser.add_argument("--output", help="Output file") |
8 | 9 |
9 args = parser.parse_args() | 10 args = parser.parse_args() |
10 | 11 |
11 infile = args.input | 12 infile = args.input |
12 genes = str(args.genes).split(",") | 13 genes = str(args.genes).split(",") |
14 include_fr1 = True if args.includefr1 == "yes" else False | |
13 outfile = args.output | 15 outfile = args.output |
14 | 16 |
15 genedic = dict() | 17 genedic = dict() |
16 | 18 |
17 mutationdic = dict() | 19 mutationdic = dict() |
44 continue | 46 continue |
45 linecount += 1 | 47 linecount += 1 |
46 linesplt = line.split("\t") | 48 linesplt = line.split("\t") |
47 ID = linesplt[IDIndex] | 49 ID = linesplt[IDIndex] |
48 genedic[ID] = linesplt[best_matchIndex] | 50 genedic[ID] = linesplt[best_matchIndex] |
49 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] | 51 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] |
50 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] | 52 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] |
51 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] | 53 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] |
52 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] | 54 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] |
53 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] | 55 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] |
54 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] | 56 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] |
106 RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] | 108 RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] |
107 WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] | 109 WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] |
108 WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] | 110 WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] |
109 TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] | 111 TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] |
110 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0,0,0,0 | 112 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0,0,0,0 |
111 for (x,y,z) in RGYW: #RGYWCount[ID] = sum([1 for (x,y,z) in RGYW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) | |
112 if not z or z == "CDR3": | |
113 continue | |
114 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) | |
115 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) | |
116 if mutations_in_motif > 0: | |
117 RGYWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 | |
118 if in_other_motifs > 1: | |
119 print in_other_motifs, ID, "RGYW", x, y, ([(x,y,z) for (xother, yother, zother) in WRCY if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in WA if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in TW if ((x <= int(xother) <= y) and (x <= int(yother) <= y))]) | |
120 | 113 |
121 for (x,y,z) in WRCY: #WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) | 114 mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] |
122 if not z or z == "CDR3": | 115 for mutation in mutationList: |
123 continue | 116 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation |
124 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) | 117 mutation_in_RGYW = any([(start <= int(where) <= end) for (start,end,region) in RGYW]) |
125 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) | 118 mutation_in_WRCY = any([(start <= int(where) <= end) for (start,end,region) in WRCY]) |
126 if mutations_in_motif > 0: | 119 mutation_in_WA = any([(start <= int(where) <= end) for (start,end,region) in WA]) |
127 WRCYCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 | 120 mutation_in_TW = any([(start <= int(where) <= end) for (start,end,region) in TW]) |
128 if in_other_motifs > 1: | 121 |
129 print in_other_motifs, ID, "WRCY", x, y | 122 in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) |
130 | 123 |
131 for (x,y,z) in WA: #WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) | 124 if in_how_many_motifs > 0: |
132 if not z or z == "CDR3": | 125 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs |
133 continue | 126 WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs |
134 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) | 127 WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs |
135 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) | 128 TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs |
136 if mutations_in_motif > 0: | |
137 WACount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 | |
138 if in_other_motifs > 1: | |
139 print in_other_motifs, ID, "WA", x, y | |
140 | |
141 for (x,y,z) in TW: #TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) | |
142 if not z or z == "CDR3": | |
143 continue | |
144 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) | |
145 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) | |
146 if mutations_in_motif > 0: | |
147 TWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 | |
148 if in_other_motifs > 1: | |
149 print in_other_motifs, ID, "TW", x, y | |
150 | |
151 | 129 |
152 directory = outfile[:outfile.rfind("/") + 1] | 130 directory = outfile[:outfile.rfind("/") + 1] |
153 value = 0 | 131 value = 0 |
154 valuedic = dict() | 132 valuedic = dict() |
155 for gene in genes: | 133 for gene in genes: |
187 for ID in IDlist: | 165 for ID in IDlist: |
188 if first: | 166 if first: |
189 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") | 167 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") |
190 first = False | 168 first = False |
191 continue | 169 continue |
192 o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n") | 170 o.write(ID + "\t" + str(int(RGYWCount[ID])) + "\t" + str(int(WRCYCount[ID])) + "\t" + str(int(WACount[ID])) + "\t" + str(int(TWCount[ID])) + "\n") |