| 0 | 1 import re | 
|  | 2 import argparse | 
|  | 3 | 
|  | 4 | 
|  | 5 parser = argparse.ArgumentParser() | 
|  | 6 parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") | 
|  | 7 parser.add_argument("--outdir", help="Output directory, 7 output files will be written here") | 
|  | 8 | 
|  | 9 args = parser.parse_args() | 
|  | 10 | 
|  | 11 infile = args.input | 
|  | 12 #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt" | 
|  | 13 outdir = args.outdir | 
|  | 14 #outfile = "identified.txt" | 
|  | 15 | 
|  | 16 dic = dict() | 
|  | 17 total = 0 | 
|  | 18 | 
|  | 19 first = True | 
|  | 20 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence | 
|  | 21   for line in f: | 
|  | 22     total += 1 | 
|  | 23     if first: | 
|  | 24       first = False | 
|  | 25       continue | 
|  | 26     linesplt = line.split("\t") | 
|  | 27     if linesplt[2] == "No results": | 
|  | 28       continue | 
|  | 29     ID = linesplt[1] | 
|  | 30     seq = linesplt[28] | 
|  | 31     dic[ID] = seq | 
|  | 32 | 
|  | 33 #lambda/kappa reference sequence | 
|  | 34 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", | 
|  | 35                  "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", | 
|  | 36                  "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} | 
|  | 37 | 
|  | 38 compiledregex = {"ca": [], | 
|  | 39                  "cg": [], | 
|  | 40                  "cm": []} | 
|  | 41 | 
|  | 42 #lambda/kappa reference sequence variable nucleotides | 
|  | 43 ca1 = {38: 't', 39: 'g', 48: 'a', 49: 'g', 51: 'c', 68: 'a', 73: 'c'} | 
|  | 44 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} | 
|  | 45 cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} | 
|  | 46 cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'} | 
|  | 47 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} | 
|  | 48 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} | 
|  | 49 | 
|  | 50 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap | 
|  | 51 chunklength = 8 | 
|  | 52 | 
|  | 53 #create the chunks of the reference sequence with regular expressions for the variable nucleotides | 
|  | 54 for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2): | 
|  | 55   pos = i | 
|  | 56   chunk = searchstrings["ca"][i:i+chunklength] | 
|  | 57   result = "" | 
|  | 58   varsInResult = 0 | 
|  | 59   for c in chunk: | 
|  | 60     if pos in ca1.keys(): | 
|  | 61       varsInResult += 1 | 
|  | 62       result += "[" + ca1[pos] + ca2[pos] + "]" | 
|  | 63     else: | 
|  | 64       result += c | 
|  | 65     pos += 1 | 
|  | 66   compiledregex["ca"].append((re.compile(result), varsInResult)) | 
|  | 67 | 
|  | 68 for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2): | 
|  | 69   pos = i | 
|  | 70   chunk = searchstrings["cg"][i:i+chunklength] | 
|  | 71   result = "" | 
|  | 72   varsInResult = 0 | 
|  | 73   for c in chunk: | 
|  | 74     if pos in cg1.keys(): | 
|  | 75       varsInResult += 1 | 
|  | 76       result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]" | 
|  | 77     else: | 
|  | 78       result += c | 
|  | 79     pos += 1 | 
|  | 80   compiledregex["cg"].append((re.compile(result), varsInResult)) | 
|  | 81 | 
|  | 82 for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2): | 
|  | 83   compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False)) | 
|  | 84 | 
|  | 85 | 
|  | 86 | 
|  | 87 def removeAndReturnMaxIndex(x): #simplifies a list comprehension | 
|  | 88   m = max(x) | 
|  | 89   index = x.index(m) | 
|  | 90   x[index] = 0 | 
|  | 91   return index | 
|  | 92 | 
|  | 93 | 
|  | 94 start_location = dict() | 
|  | 95 hits = dict() | 
|  | 96 alltotal = 0 | 
|  | 97 for key in compiledregex.keys(): #for ca/cg/cm | 
|  | 98 	regularexpressions = compiledregex[key] #get the compiled regular expressions | 
|  | 99 	for ID in dic.keys()[0:]: #for every ID | 
|  | 100 		if ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists | 
|  | 101 			hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0} | 
|  | 102 		currentIDHits = hits[ID] | 
|  | 103 		seq = dic[ID] | 
|  | 104 		lastindex = 0 | 
|  | 105 		start = [0] * len(seq) | 
|  | 106 		for i, regexp in enumerate(regularexpressions): #for every regular expression | 
|  | 107 			regex, hasVar = regexp | 
|  | 108 			matches = regex.finditer(seq[lastindex:]) | 
|  | 109 			for match in matches: #for every match with the current regex, only uses the first hit | 
|  | 110 				lastindex += match.start() | 
|  | 111 				start[lastindex - chunklength / 2 * i] += 1 | 
|  | 112 				if hasVar: #if the regex has a variable nt in it | 
|  | 113 					chunkstart = chunklength / 2 * i #where in the reference does this chunk start | 
|  | 114 					chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end | 
|  | 115 					if key == "ca": #just calculate the variable nt score for 'ca', cheaper | 
|  | 116 						currentIDHits["ca1"] += len([1 for x in ca1 if chunkstart <= x < chunkend and ca1[x] == seq[lastindex + x - chunkstart]]) | 
|  | 117 						currentIDHits["ca2"] += len([1 for x in ca2 if chunkstart <= x < chunkend and ca2[x] == seq[lastindex + x - chunkstart]]) | 
|  | 118 					elif key == "cg": #just calculate the variable nt score for 'cg', cheaper | 
|  | 119 						currentIDHits["cg1"] += len([1 for x in cg1 if chunkstart <= x < chunkend and cg1[x] == seq[lastindex + x - chunkstart]]) | 
|  | 120 						currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]]) | 
|  | 121 						currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]]) | 
|  | 122 						currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]]) | 
|  | 123 					else: #key == "cm" #no variable regions in 'cm' | 
|  | 124 						pass | 
|  | 125 				break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped | 
|  | 126 			else: #only runs if there were no hits | 
|  | 127 				continue | 
|  | 128 			#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] | 
|  | 129 			currentIDHits[key + "_hits"] += 1 | 
|  | 130 		start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1]) | 
|  | 131 		#start_location[ID + "_" + key] = str(start.index(max(start))) | 
|  | 132 | 
|  | 133 | 
|  | 134 chunksInCA = len(compiledregex["ca"]) | 
|  | 135 chunksInCG = len(compiledregex["cg"]) | 
|  | 136 chunksInCM = len(compiledregex["cm"]) | 
|  | 137 requiredChunkPercentage = 0.7 | 
|  | 138 varsInCA = float(len(ca1.keys()) * 2) | 
|  | 139 varsInCG = float(len(cg1.keys()) * 2) + 1 | 
|  | 140 varsInCM = 0 | 
|  | 141 requiredVarPercentage = 0.7 | 
|  | 142 | 
|  | 143 ca = 0 | 
|  | 144 ca1 = 0 | 
|  | 145 ca2 = 0 | 
|  | 146 cg = 0 | 
|  | 147 cg1 = 0 | 
|  | 148 cg2 = 0 | 
|  | 149 cg3 = 0 | 
|  | 150 cg4 = 0 | 
|  | 151 cm = 0 | 
|  | 152 try: | 
|  | 153 	cafile = open(outdir + "/ca.txt", 'w') | 
|  | 154 	ca1file = open(outdir + "/ca1.txt", 'w') | 
|  | 155 	ca2file = open(outdir + "/ca2.txt", 'w') | 
|  | 156 	cgfile = open(outdir + "/cg.txt", 'w') | 
|  | 157 	cg1file = open(outdir + "/cg1.txt", 'w') | 
|  | 158 	cg2file = open(outdir + "/cg2.txt", 'w') | 
|  | 159 	cg3file = open(outdir + "/cg3.txt", 'w') | 
|  | 160 	cg4file = open(outdir + "/cg4.txt", 'w') | 
|  | 161 	cmfile = open(outdir + "/cm.txt", 'w') | 
|  | 162 	unmatchedfile = open(outdir + "/unmatched.txt", 'w') | 
|  | 163 	cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 164 	ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 165 	ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 166 	cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 167 	cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 168 	cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 169 	cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 170 	cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 171 	cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | 
|  | 172 	unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n") | 
|  | 173 	for ID in hits.keys(): | 
|  | 174 		currentIDHits = hits[ID] | 
|  | 175 		possibleca = float(len(compiledregex["ca"])) | 
|  | 176 		possiblecg = float(len(compiledregex["cg"])) | 
|  | 177 		possiblecm = float(len(compiledregex["cm"])) | 
|  | 178 		cahits = currentIDHits["ca_hits"] | 
|  | 179 		cghits = currentIDHits["cg_hits"] | 
|  | 180 		cmhits = currentIDHits["cm_hits"] | 
|  | 181 		if cahits > cghits and cahits > cmhits: #its a ca gene | 
|  | 182 			if cahits <= int(chunksInCA * requiredChunkPercentage): | 
|  | 183 				unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n") | 
|  | 184 				continue | 
|  | 185 			ca += 1 | 
|  | 186 			ca1hits = currentIDHits["ca1"] | 
|  | 187 			ca2hits = currentIDHits["ca2"] | 
|  | 188 			cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 
|  | 189 			if ca1hits > ca2hits: | 
|  | 190 				#print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit" | 
|  | 191 				if ca1hits <= int(varsInCA * requiredVarPercentage): | 
|  | 192 					unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") | 
|  | 193 					continue | 
|  | 194 				ca1 += 1 | 
|  | 195 				ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 
|  | 196 			else: | 
|  | 197 				#print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit" | 
|  | 198 				if ca2hits <= int(varsInCA * requiredVarPercentage): | 
|  | 199 					unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") | 
|  | 200 					continue | 
|  | 201 				ca2 += 1 | 
|  | 202 				ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 
|  | 203 		elif cghits > cahits and cghits > cmhits: #its a cg gene | 
|  | 204 			if cghits <= int(chunksInCG * requiredChunkPercentage): | 
|  | 205 				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n") | 
|  | 206 				continue | 
|  | 207 			cg += 1 | 
|  | 208 			cg1hits = currentIDHits["cg1"] | 
|  | 209 			cg2hits = currentIDHits["cg2"] | 
|  | 210 			cg3hits = currentIDHits["cg3"] | 
|  | 211 			cg4hits = currentIDHits["cg4"] | 
|  | 212 			cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 
|  | 213 			if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene | 
|  | 214 				if cg1hits <= int(varsInCG * requiredVarPercentage): | 
|  | 215 					unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n") | 
|  | 216 					continue | 
|  | 217 				cg1 += 1 | 
|  | 218 				cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 
|  | 219 			elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene | 
|  | 220 				if cg2hits <= int(varsInCG * requiredVarPercentage): | 
|  | 221 					unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n") | 
|  | 222 					continue | 
|  | 223 				cg2 += 1 | 
|  | 224 				cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 
|  | 225 			elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene | 
|  | 226 				if cg3hits <= int(varsInCG * requiredVarPercentage): | 
|  | 227 					unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n") | 
|  | 228 					continue | 
|  | 229 				cg3 += 1 | 
|  | 230 				cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 
|  | 231 			else: #cg4 gene | 
|  | 232 				if cg4hits <= int(varsInCG * requiredVarPercentage): | 
|  | 233 					unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n") | 
|  | 234 					continue | 
|  | 235 				cg4 += 1 | 
|  | 236 				cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 
|  | 237 		else: #its a cm gene | 
|  | 238 			if cmhits <= int(chunksInCM * requiredChunkPercentage): | 
|  | 239 				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n") | 
|  | 240 				continue | 
|  | 241 			cm += 1 | 
|  | 242 			cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") | 
|  | 243 finally: | 
|  | 244   cafile.close() | 
|  | 245   ca1file.close() | 
|  | 246   ca2file.close() | 
|  | 247   cgfile.close() | 
|  | 248   cg1file.close() | 
|  | 249   cg2file.close() | 
|  | 250   cg3file.close() | 
|  | 251   cg4file.close() | 
|  | 252   cmfile.close() | 
|  | 253   unmatchedfile.close() | 
|  | 254 | 
|  | 255 | 
|  | 256 #print ca,cg,cm,(ca+cg+cm) | 
|  | 257 | 
|  | 258 | 
|  | 259 | 
|  | 260 | 
|  | 261 | 
|  | 262 | 
|  | 263 | 
|  | 264 | 
|  | 265 |