Mercurial > repos > davidvanzessen > mutation_analysis
comparison mutation_analysis.py @ 31:c623690e3b81 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Thu, 09 Apr 2015 08:52:18 -0400 |
| parents | 7e44617c9ca4 |
| children | 2a7343e4be5a |
comparison
equal
deleted
inserted
replaced
| 30:7e44617c9ca4 | 31:c623690e3b81 |
|---|---|
| 2 import argparse | 2 import argparse |
| 3 | 3 |
| 4 parser = argparse.ArgumentParser() | 4 parser = argparse.ArgumentParser() |
| 5 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") | 5 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") |
| 6 parser.add_argument("--genes", help="The genes available in the 'best_match' column") | 6 parser.add_argument("--genes", help="The genes available in the 'best_match' column") |
| 7 parser.add_argument("--includefr1", help="The genes available in the 'best_match' column") | |
| 7 parser.add_argument("--output", help="Output file") | 8 parser.add_argument("--output", help="Output file") |
| 8 | 9 |
| 9 args = parser.parse_args() | 10 args = parser.parse_args() |
| 10 | 11 |
| 11 infile = args.input | 12 infile = args.input |
| 12 genes = str(args.genes).split(",") | 13 genes = str(args.genes).split(",") |
| 14 include_fr1 = True if args.includefr1 == "yes" else False | |
| 13 outfile = args.output | 15 outfile = args.output |
| 14 | 16 |
| 15 genedic = dict() | 17 genedic = dict() |
| 16 | 18 |
| 17 mutationdic = dict() | 19 mutationdic = dict() |
| 44 continue | 46 continue |
| 45 linecount += 1 | 47 linecount += 1 |
| 46 linesplt = line.split("\t") | 48 linesplt = line.split("\t") |
| 47 ID = linesplt[IDIndex] | 49 ID = linesplt[IDIndex] |
| 48 genedic[ID] = linesplt[best_matchIndex] | 50 genedic[ID] = linesplt[best_matchIndex] |
| 49 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] | 51 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] |
| 50 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] | 52 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] |
| 51 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] | 53 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] |
| 52 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] | 54 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] |
| 53 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] | 55 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] |
| 54 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] | 56 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] |
| 106 RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] | 108 RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] |
| 107 WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] | 109 WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] |
| 108 WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] | 110 WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] |
| 109 TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] | 111 TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] |
| 110 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0,0,0,0 | 112 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0,0,0,0 |
| 111 for (x,y,z) in RGYW: #RGYWCount[ID] = sum([1 for (x,y,z) in RGYW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) | |
| 112 if not z or z == "CDR3": | |
| 113 continue | |
| 114 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) | |
| 115 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) | |
| 116 if mutations_in_motif > 0: | |
| 117 RGYWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 | |
| 118 if in_other_motifs > 1: | |
| 119 print in_other_motifs, ID, "RGYW", x, y, ([(x,y,z) for (xother, yother, zother) in WRCY if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in WA if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in TW if ((x <= int(xother) <= y) and (x <= int(yother) <= y))]) | |
| 120 | 113 |
| 121 for (x,y,z) in WRCY: #WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) | 114 mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] |
| 122 if not z or z == "CDR3": | 115 for mutation in mutationList: |
| 123 continue | 116 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation |
| 124 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) | 117 mutation_in_RGYW = any([(start <= int(where) <= end) for (start,end,region) in RGYW]) |
| 125 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) | 118 mutation_in_WRCY = any([(start <= int(where) <= end) for (start,end,region) in WRCY]) |
| 126 if mutations_in_motif > 0: | 119 mutation_in_WA = any([(start <= int(where) <= end) for (start,end,region) in WA]) |
| 127 WRCYCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 | 120 mutation_in_TW = any([(start <= int(where) <= end) for (start,end,region) in TW]) |
| 128 if in_other_motifs > 1: | 121 |
| 129 print in_other_motifs, ID, "WRCY", x, y | 122 in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) |
| 130 | 123 |
| 131 for (x,y,z) in WA: #WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) | 124 if in_how_many_motifs > 0: |
| 132 if not z or z == "CDR3": | 125 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs |
| 133 continue | 126 WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs |
| 134 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) | 127 WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs |
| 135 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) | 128 TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs |
| 136 if mutations_in_motif > 0: | |
| 137 WACount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 | |
| 138 if in_other_motifs > 1: | |
| 139 print in_other_motifs, ID, "WA", x, y | |
| 140 | |
| 141 for (x,y,z) in TW: #TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) | |
| 142 if not z or z == "CDR3": | |
| 143 continue | |
| 144 mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) | |
| 145 in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) | |
| 146 if mutations_in_motif > 0: | |
| 147 TWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 | |
| 148 if in_other_motifs > 1: | |
| 149 print in_other_motifs, ID, "TW", x, y | |
| 150 | |
| 151 | 129 |
| 152 directory = outfile[:outfile.rfind("/") + 1] | 130 directory = outfile[:outfile.rfind("/") + 1] |
| 153 value = 0 | 131 value = 0 |
| 154 valuedic = dict() | 132 valuedic = dict() |
| 155 for gene in genes: | 133 for gene in genes: |
| 187 for ID in IDlist: | 165 for ID in IDlist: |
| 188 if first: | 166 if first: |
| 189 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") | 167 o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") |
| 190 first = False | 168 first = False |
| 191 continue | 169 continue |
| 192 o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n") | 170 o.write(ID + "\t" + str(int(RGYWCount[ID])) + "\t" + str(int(WRCYCount[ID])) + "\t" + str(int(WACount[ID])) + "\t" + str(int(TWCount[ID])) + "\n") |
