| 53 | 1 from __future__ import division | 
| 49 | 2 from collections import defaultdict | 
| 0 | 3 import re | 
|  | 4 import argparse | 
|  | 5 | 
|  | 6 parser = argparse.ArgumentParser() | 
| 39 | 7 parser.add_argument("--input", | 
| 49 | 8 					help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") | 
| 4 | 9 parser.add_argument("--genes", help="The genes available in the 'best_match' column") | 
| 73 | 10 parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?") | 
| 0 | 11 parser.add_argument("--output", help="Output file") | 
|  | 12 | 
|  | 13 args = parser.parse_args() | 
|  | 14 | 
| 4 | 15 infile = args.input | 
|  | 16 genes = str(args.genes).split(",") | 
| 32 | 17 print "includefr1 =", args.includefr1 | 
| 31 | 18 include_fr1 = True if args.includefr1 == "yes" else False | 
| 39 | 19 outfile = args.output | 
| 4 | 20 | 
|  | 21 genedic = dict() | 
| 0 | 22 | 
|  | 23 mutationdic = dict() | 
|  | 24 mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") | 
|  | 25 linecount = 0 | 
|  | 26 | 
| 4 | 27 IDIndex = 0 | 
|  | 28 best_matchIndex = 0 | 
|  | 29 fr1Index = 0 | 
|  | 30 cdr1Index = 0 | 
|  | 31 fr2Index = 0 | 
|  | 32 cdr2Index = 0 | 
|  | 33 fr3Index = 0 | 
| 39 | 34 first = True | 
| 26 | 35 IDlist = [] | 
|  | 36 mutationList = [] | 
| 43 | 37 mutationListByID = {} | 
| 49 | 38 cdr1LengthDic = {} | 
|  | 39 cdr2LengthDic = {} | 
| 26 | 40 | 
| 4 | 41 with open(infile, 'r') as i: | 
| 43 | 42 	for line in i: | 
|  | 43 		if first: | 
|  | 44 			linesplt = line.split("\t") | 
|  | 45 			IDIndex = linesplt.index("Sequence.ID") | 
|  | 46 			best_matchIndex = linesplt.index("best_match") | 
|  | 47 			fr1Index = linesplt.index("FR1.IMGT") | 
|  | 48 			cdr1Index = linesplt.index("CDR1.IMGT") | 
|  | 49 			fr2Index = linesplt.index("FR2.IMGT") | 
|  | 50 			cdr2Index = linesplt.index("CDR2.IMGT") | 
|  | 51 			fr3Index = linesplt.index("FR3.IMGT") | 
| 85 | 52 			cdr1LengthIndex = linesplt.index("CDR1.IMGT.length") | 
|  | 53 			cdr2LengthIndex = linesplt.index("CDR2.IMGT.length") | 
| 43 | 54 			first = False | 
|  | 55 			continue | 
|  | 56 		linecount += 1 | 
|  | 57 		linesplt = line.split("\t") | 
|  | 58 		ID = linesplt[IDIndex] | 
|  | 59 		genedic[ID] = linesplt[best_matchIndex] | 
| 63 | 60 		try: | 
|  | 61 			mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] | 
|  | 62 			mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] | 
|  | 63 			mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] | 
|  | 64 			mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] | 
|  | 65 			mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] | 
|  | 66 			mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] | 
|  | 67 		except: | 
|  | 68 			print linesplt | 
|  | 69 			print linecount | 
| 43 | 70 		mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | 
|  | 71 		mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | 
| 26 | 72 | 
| 49 | 73 		cdr1Length = linesplt[cdr1LengthIndex] | 
|  | 74 		cdr2Length = linesplt[cdr2LengthIndex] | 
|  | 75 | 
| 85 | 76 		cdr1LengthDic[ID] = int(cdr1Length) if cdr1Length != "X" else 0 | 
|  | 77 		cdr2LengthDic[ID] = int(cdr2Length) if cdr2Length != "X" else 0 | 
|  | 78 | 
| 43 | 79 		IDlist += [ID] | 
|  | 80 | 
| 49 | 81 AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1)  # [4] is the position of the AA mutation, None if silent | 
|  | 82 | 
|  | 83 AA_mutation = [0] * AALength | 
| 104 | 84 AA_mutation_dic = {"ca": AA_mutation[:], "cg": AA_mutation[:], "cm": AA_mutation[:], "un": AA_mutation[:]} | 
| 43 | 85 AA_mutation_empty = AA_mutation[:] | 
| 39 | 86 | 
| 43 | 87 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" | 
|  | 88 with open(aa_mutations_by_id_file, 'w') as o: | 
|  | 89 	for ID in mutationListByID.keys(): | 
|  | 90 		AA_mutation_for_ID = AA_mutation_empty[:] | 
|  | 91 		for mutation in mutationListByID[ID]: | 
|  | 92 			if mutation[4]: | 
| 104 | 93 				AA_mutation_position = int(mutation[4]) | 
|  | 94 				AA_mutation[AA_mutation_position] += 1 | 
|  | 95 				AA_mutation_for_ID[AA_mutation_position] += 1 | 
|  | 96 				clss = genedic[ID][:2] | 
|  | 97 				AA_mutation_dic[clss][AA_mutation_position] += 1 | 
| 49 | 98 		o.write(ID + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") | 
| 26 | 99 | 
| 43 | 100 | 
|  | 101 | 
| 49 | 102 #absent AA stuff | 
|  | 103 absentAACDR1Dic = defaultdict(list) | 
|  | 104 absentAACDR1Dic[5] = range(29,36) | 
|  | 105 absentAACDR1Dic[6] = range(29,35) | 
|  | 106 absentAACDR1Dic[7] = range(30,35) | 
|  | 107 absentAACDR1Dic[8] = range(30,34) | 
|  | 108 absentAACDR1Dic[9] = range(31,34) | 
|  | 109 absentAACDR1Dic[10] = range(31,33) | 
|  | 110 absentAACDR1Dic[11] = [32] | 
|  | 111 | 
|  | 112 absentAACDR2Dic = defaultdict(list) | 
|  | 113 absentAACDR2Dic[0] = range(55,65) | 
|  | 114 absentAACDR2Dic[1] = range(56,65) | 
|  | 115 absentAACDR2Dic[2] = range(56,64) | 
|  | 116 absentAACDR2Dic[3] = range(57,64) | 
|  | 117 absentAACDR2Dic[4] = range(57,63) | 
|  | 118 absentAACDR2Dic[5] = range(58,63) | 
|  | 119 absentAACDR2Dic[6] = range(58,62) | 
|  | 120 absentAACDR2Dic[7] = range(59,62) | 
|  | 121 absentAACDR2Dic[8] = range(59,61) | 
|  | 122 absentAACDR2Dic[9] = [60] | 
|  | 123 | 
|  | 124 absentAA = [len(IDlist)] * (AALength-1) | 
|  | 125 for k, cdr1Length in cdr1LengthDic.iteritems(): | 
|  | 126 	for c in absentAACDR1Dic[cdr1Length]: | 
|  | 127 		absentAA[c] -= 1 | 
|  | 128 | 
|  | 129 for k, cdr2Length in cdr2LengthDic.iteritems(): | 
|  | 130 	for c in absentAACDR2Dic[cdr2Length]: | 
|  | 131 		absentAA[c] -= 1 | 
|  | 132 | 
|  | 133 | 
|  | 134 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" | 
|  | 135 with open(aa_mutations_by_id_file, 'w') as o: | 
|  | 136 	o.write("ID\tcdr1length\tcdr2length\t" + "\t".join([str(x) for x in range(1,AALength-1)]) + "\n") | 
|  | 137 	for ID in IDlist: | 
|  | 138 		absentAAbyID = [1] * (AALength-1) | 
|  | 139 		cdr1Length = cdr1LengthDic[ID] | 
|  | 140 		for c in absentAACDR1Dic[cdr1Length]: | 
|  | 141 			absentAAbyID[c] -= 1 | 
|  | 142 | 
|  | 143 		cdr2Length = cdr2LengthDic[ID] | 
|  | 144 		for c in absentAACDR2Dic[cdr2Length]: | 
|  | 145 			absentAAbyID[c] -= 1 | 
|  | 146 		o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") | 
|  | 147 | 
|  | 148 | 
| 26 | 149 | 
|  | 150 aa_mutations_file = outfile[:outfile.rindex("/")] + "/aa_mutations.txt" | 
|  | 151 with open(aa_mutations_file, 'w') as o: | 
| 49 | 152 	o.write("row.name\t" + "\t".join([str(x) for x in range(1, AALength-1)]) + "\n") | 
|  | 153 	o.write("mutations.at.position\t" + "\t".join([str(x) for x in AA_mutation[1:]]) + "\n") | 
|  | 154 	o.write("AA.at.position\t" + "\t".join([str(x) for x in absentAA]) + "\n") | 
| 39 | 155 | 
| 104 | 156 | 
|  | 157 aa_mutations_file_ca = outfile[:outfile.rindex("/")] + "/aa_mutations_ca.txt" | 
|  | 158 with open(aa_mutations_file_ca, 'w') as o: | 
|  | 159 	o.write("row.name\t" + "\t".join([str(x) for x in range(1, AALength-1)]) + "\n") | 
|  | 160 	o.write("mutations.at.position\t" + "\t".join([str(x) for x in AA_mutation_dic["ca"][1:]]) + "\n") | 
|  | 161 	o.write("AA.at.position\t" + "\t".join([str(x) for x in absentAA]) + "\n") | 
|  | 162 | 
|  | 163 | 
|  | 164 aa_mutations_file_cg = outfile[:outfile.rindex("/")] + "/aa_mutations_cg.txt" | 
|  | 165 with open(aa_mutations_file_cg, 'w') as o: | 
|  | 166 	o.write("row.name\t" + "\t".join([str(x) for x in range(1, AALength-1)]) + "\n") | 
|  | 167 	o.write("mutations.at.position\t" + "\t".join([str(x) for x in AA_mutation_dic["cg"][1:]]) + "\n") | 
|  | 168 	o.write("AA.at.position\t" + "\t".join([str(x) for x in absentAA]) + "\n") | 
|  | 169 | 
|  | 170 | 
|  | 171 aa_mutations_file_cm = outfile[:outfile.rindex("/")] + "/aa_mutations_cm.txt" | 
|  | 172 with open(aa_mutations_file_cm, 'w') as o: | 
|  | 173 	o.write("row.name\t" + "\t".join([str(x) for x in range(1, AALength-1)]) + "\n") | 
|  | 174 	o.write("mutations.at.position\t" + "\t".join([str(x) for x in AA_mutation_dic["cg"][1:]]) + "\n") | 
|  | 175 	o.write("AA.at.position\t" + "\t".join([str(x) for x in absentAA]) + "\n") | 
|  | 176 | 
| 0 | 177 if linecount == 0: | 
| 49 | 178 	print "No data, exiting" | 
|  | 179 	with open(outfile, 'w') as o: | 
|  | 180 		o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) | 
|  | 181 		o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) | 
|  | 182 		o.write("WA (%)," + ("0,0,0\n" * len(genes))) | 
|  | 183 		o.write("TW (%)," + ("0,0,0\n" * len(genes))) | 
|  | 184 	import sys | 
| 39 | 185 | 
| 49 | 186 	sys.exit() | 
| 0 | 187 | 
|  | 188 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") | 
| 98 | 189 RGYWCount = {} | 
|  | 190 WRCYCount = {} | 
|  | 191 WACount = {} | 
|  | 192 TWCount = {} | 
| 0 | 193 | 
| 98 | 194 #IDIndex = 0 | 
| 4 | 195 ataIndex = 0 | 
|  | 196 tatIndex = 0 | 
|  | 197 aggctatIndex = 0 | 
|  | 198 atagcctIndex = 0 | 
|  | 199 first = True | 
|  | 200 with open(infile, 'r') as i: | 
| 49 | 201 	for line in i: | 
|  | 202 		if first: | 
|  | 203 			linesplt = line.split("\t") | 
|  | 204 			ataIndex = linesplt.index("X.a.t.a") | 
|  | 205 			tatIndex = linesplt.index("t.a.t.") | 
|  | 206 			aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.") | 
|  | 207 			atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.") | 
|  | 208 			first = False | 
|  | 209 			continue | 
|  | 210 		linesplt = line.split("\t") | 
|  | 211 		gene = linesplt[best_matchIndex] | 
|  | 212 		ID = linesplt[IDIndex] | 
| 98 | 213 		if ID == "ca2": | 
|  | 214 			print linesplt | 
| 49 | 215 		RGYW = [(int(x), int(y), z) for (x, y, z) in | 
|  | 216 				[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] | 
|  | 217 		WRCY = [(int(x), int(y), z) for (x, y, z) in | 
|  | 218 				[hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] | 
|  | 219 		WA = [(int(x), int(y), z) for (x, y, z) in | 
|  | 220 			  [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] | 
|  | 221 		TW = [(int(x), int(y), z) for (x, y, z) in | 
|  | 222 			  [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] | 
|  | 223 		RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 | 
| 39 | 224 | 
| 49 | 225 		mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ | 
|  | 226 			ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | 
|  | 227 		for mutation in mutationList: | 
|  | 228 			frm, where, to, AAfrm, AAwhere, AAto, junk = mutation | 
|  | 229 			mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) | 
|  | 230 			mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) | 
|  | 231 			mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) | 
|  | 232 			mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) | 
| 39 | 233 | 
| 49 | 234 			in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) | 
| 39 | 235 | 
| 49 | 236 			if in_how_many_motifs > 0: | 
|  | 237 				RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs | 
|  | 238 				WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs | 
|  | 239 				WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs | 
|  | 240 				TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs | 
| 0 | 241 | 
| 53 | 242 | 
|  | 243 def mean(lst): | 
|  | 244 	return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0 | 
|  | 245 | 
|  | 246 | 
|  | 247 def median(lst): | 
|  | 248 	lst = sorted(lst) | 
|  | 249 	l = len(lst) | 
|  | 250 	if l == 0: | 
|  | 251 		return 0 | 
|  | 252 	if l == 1: | 
|  | 253 		return lst[0] | 
|  | 254 | 
|  | 255 	l = int(l / 2) | 
|  | 256 | 
|  | 257 	if len(lst) % 2 == 0: | 
|  | 258 		return float(lst[l] + lst[(l - 1)]) / 2.0 | 
|  | 259 	else: | 
|  | 260 		return lst[l] | 
|  | 261 | 
|  | 262 funcs = {"mean": mean, "median": median, "sum": sum} | 
|  | 263 | 
| 4 | 264 directory = outfile[:outfile.rfind("/") + 1] | 
| 22 | 265 value = 0 | 
| 4 | 266 valuedic = dict() | 
| 53 | 267 | 
|  | 268 for fname in funcs.keys(): | 
|  | 269 	for gene in genes: | 
|  | 270 		with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: | 
|  | 271 			valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) | 
|  | 272 	with open(directory + "all_" + fname + "_value.txt", 'r') as v: | 
|  | 273 		valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) | 
|  | 274 | 
| 78 | 275 | 
| 53 | 276 def get_xyz(lst, gene, f, fname): | 
|  | 277 	x = int(round(f(lst))) | 
|  | 278 	y = valuedic[gene + "_" + fname] | 
| 98 | 279 	z = str(round(x / float(y) * 100, 1)) if y != 0 else "0" | 
| 53 | 280 	return (str(x), str(y), z) | 
| 4 | 281 | 
|  | 282 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} | 
|  | 283 arr = ["RGYW", "WRCY", "WA", "TW"] | 
| 53 | 284 | 
| 98 | 285 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} | 
|  | 286 | 
| 53 | 287 for fname in funcs.keys(): | 
|  | 288 	func = funcs[fname] | 
|  | 289 	foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" | 
|  | 290 	with open(foutfile, 'w') as o: | 
|  | 291 		for typ in arr: | 
|  | 292 			o.write(typ + " (%)") | 
|  | 293 			curr = dic[typ] | 
|  | 294 			for gene in genes: | 
| 98 | 295 				geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop.... | 
| 53 | 296 				if valuedic[gene + "_" + fname] is 0: | 
|  | 297 					o.write(",0,0,0") | 
|  | 298 				else: | 
|  | 299 					x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) | 
|  | 300 					o.write("," + x + "," + y + "," + z) | 
| 98 | 301 | 
|  | 302 			x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) | 
| 53 | 303 			o.write("," + x + "," + y + "," + z + "\n") | 
| 21 | 304 | 
|  | 305 | 
| 39 | 306 # for testing | 
| 21 | 307 seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt" | 
|  | 308 with open(seq_motif_file, 'w') as o: | 
| 49 | 309 	o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") | 
|  | 310 	for ID in IDlist: | 
| 53 | 311 		o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") |