comparison gene_identification.py @ 4:069419cccba4 draft

Uploaded
author davidvanzessen
date Mon, 22 Sep 2014 10:19:36 -0400
parents 2f4298673519
children 71a12810eff3
comparison
equal deleted inserted replaced
3:a0b27058dcac 4:069419cccba4
3 import time 3 import time
4 starttime= int(time.time() * 1000) 4 starttime= int(time.time() * 1000)
5 5
6 parser = argparse.ArgumentParser() 6 parser = argparse.ArgumentParser()
7 parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") 7 parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")
8 parser.add_argument("--outdir", help="Output directory, 7 output files will be written here") 8 parser.add_argument("--output", help="The annotated summary output file")
9 9
10 args = parser.parse_args() 10 args = parser.parse_args()
11 11
12 infile = args.input 12 infile = args.input
13 #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt" 13 #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"
14 outdir = args.outdir 14 output = args.output
15 #outfile = "identified.txt" 15 #outfile = "identified.txt"
16 16
17 dic = dict() 17 dic = dict()
18 total = 0 18 total = 0
19 19
20
20 first = True 21 first = True
22 IDIndex = 0
23 seqIndex = 0
24
21 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence 25 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
22 for line in f: 26 for line in f:
23 total += 1 27 total += 1
24 if first: 28 if first:
25 first = False 29 linesplt = line.split("\t")
26 continue 30 IDIndex = linesplt.index("Sequence ID")
27 linesplt = line.split("\t") 31 seqIndex = linesplt.index("Sequence")
28 if linesplt[2] == "No results": 32 first = False
29 continue 33 continue
30 ID = linesplt[1] 34 linesplt = line.split("\t")
31 seq = linesplt[28] 35 ID = linesplt[IDIndex]
32 dic[ID] = seq 36 if len(linesplt) < 28: #weird rows without a sequence
37 dic[ID] = ""
38 else:
39 dic[ID] = linesplt[seqIndex]
33 40
34 #lambda/kappa reference sequence 41 #lambda/kappa reference sequence
35 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", 42 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag",
36 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", 43 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc",
37 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} 44 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"}
126 break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped 133 break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped
127 else: #only runs if there were no hits 134 else: #only runs if there were no hits
128 continue 135 continue
129 #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] 136 #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i]
130 currentIDHits[key + "_hits"] += 1 137 currentIDHits[key + "_hits"] += 1
131 start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1]) 138 start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1])
132 #start_location[ID + "_" + key] = str(start.index(max(start))) 139 #start_location[ID + "_" + key] = str(start.index(max(start)))
133 140
134 141
135 chunksInCA = len(compiledregex["ca"]) 142 chunksInCA = len(compiledregex["ca"])
136 chunksInCG = len(compiledregex["cg"]) 143 chunksInCG = len(compiledregex["cg"])
139 varsInCA = float(len(ca1.keys()) * 2) 146 varsInCA = float(len(ca1.keys()) * 2)
140 varsInCG = float(len(cg1.keys()) * 2) + 1 147 varsInCG = float(len(cg1.keys()) * 2) + 1
141 varsInCM = 0 148 varsInCM = 0
142 requiredVarPercentage = 0.7 149 requiredVarPercentage = 0.7
143 150
144 ca = 0 151
145 ca1 = 0 152 first = True
146 ca2 = 0 153 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
147 cg = 0 154 with open(output, 'w') as o:
148 cg1 = 0 155 for line in f:
149 cg2 = 0 156 total += 1
150 cg3 = 0 157 if first:
151 cg4 = 0 158 o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
152 cm = 0 159 first = False
153 try:
154 cafile = open(outdir + "/ca.txt", 'w')
155 ca1file = open(outdir + "/ca1.txt", 'w')
156 ca2file = open(outdir + "/ca2.txt", 'w')
157 cgfile = open(outdir + "/cg.txt", 'w')
158 cg1file = open(outdir + "/cg1.txt", 'w')
159 cg2file = open(outdir + "/cg2.txt", 'w')
160 cg3file = open(outdir + "/cg3.txt", 'w')
161 cg4file = open(outdir + "/cg4.txt", 'w')
162 cmfile = open(outdir + "/cm.txt", 'w')
163 unmatchedfile = open(outdir + "/unmatched.txt", 'w')
164 cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
165 ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
166 ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
167 cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
168 cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
169 cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
170 cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
171 cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
172 cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
173 unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n")
174 for ID in hits.keys():
175 currentIDHits = hits[ID]
176 possibleca = float(len(compiledregex["ca"]))
177 possiblecg = float(len(compiledregex["cg"]))
178 possiblecm = float(len(compiledregex["cm"]))
179 cahits = currentIDHits["ca_hits"]
180 cghits = currentIDHits["cg_hits"]
181 cmhits = currentIDHits["cm_hits"]
182 if cahits > cghits and cahits > cmhits: #its a ca gene
183 if cahits <= int(chunksInCA * requiredChunkPercentage):
184 unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n")
185 continue 160 continue
186 ca += 1 161 linesplt = line.split("\t")
187 ca1hits = currentIDHits["ca1"] 162 if linesplt[2] == "No results":
188 ca2hits = currentIDHits["ca2"] 163 pass
189 cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 164 ID = linesplt[1]
190 if ca1hits > ca2hits: 165 currentIDHits = hits[ID]
191 #print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit" 166 possibleca = float(len(compiledregex["ca"]))
192 if ca1hits <= int(varsInCA * requiredVarPercentage): 167 possiblecg = float(len(compiledregex["cg"]))
193 unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") 168 possiblecm = float(len(compiledregex["cm"]))
194 continue 169 cahits = currentIDHits["ca_hits"]
195 ca1 += 1 170 cghits = currentIDHits["cg_hits"]
196 ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 171 cmhits = currentIDHits["cm_hits"]
197 else: 172 if cahits > cghits and cahits > cmhits: #its a ca gene
198 #print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit" 173 ca1hits = currentIDHits["ca1"]
199 if ca2hits <= int(varsInCA * requiredVarPercentage): 174 ca2hits = currentIDHits["ca2"]
200 unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") 175 if ca1hits > ca2hits:
201 continue 176 o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
202 ca2 += 1 177 else:
203 ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 178 o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
204 elif cghits > cahits and cghits > cmhits: #its a cg gene 179 elif cghits > cahits and cghits > cmhits: #its a cg gene
205 if cghits <= int(chunksInCG * requiredChunkPercentage): 180 cg1hits = currentIDHits["cg1"]
206 unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n") 181 cg2hits = currentIDHits["cg2"]
207 continue 182 cg3hits = currentIDHits["cg3"]
208 cg += 1 183 cg4hits = currentIDHits["cg4"]
209 cg1hits = currentIDHits["cg1"] 184 if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
210 cg2hits = currentIDHits["cg2"] 185 o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
211 cg3hits = currentIDHits["cg3"] 186 elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
212 cg4hits = currentIDHits["cg4"] 187 o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
213 cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 188 elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
214 if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene 189 o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
215 if cg1hits <= int(varsInCG * requiredVarPercentage): 190 else: #cg4 gene
216 unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n") 191 o.write(line.rstrip() + "\tcg3\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
217 continue 192 else: #its a cm gene
218 cg1 += 1 193 o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
219 cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
220 elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
221 if cg2hits <= int(varsInCG * requiredVarPercentage):
222 unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n")
223 continue
224 cg2 += 1
225 cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
226 elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
227 if cg3hits <= int(varsInCG * requiredVarPercentage):
228 unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n")
229 continue
230 cg3 += 1
231 cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
232 else: #cg4 gene
233 if cg4hits <= int(varsInCG * requiredVarPercentage):
234 unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n")
235 continue
236 cg4 += 1
237 cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
238 else: #its a cm gene
239 if cmhits <= int(chunksInCM * requiredChunkPercentage):
240 unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n")
241 continue
242 cm += 1
243 cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
244 finally:
245 cafile.close()
246 ca1file.close()
247 ca2file.close()
248 cgfile.close()
249 cg1file.close()
250 cg2file.close()
251 cg3file.close()
252 cg4file.close()
253 cmfile.close()
254 unmatchedfile.close()
255
256
257 #print ca,cg,cm,(ca+cg+cm)
258 194
259 print "Time: %i" % (int(time.time() * 1000) - starttime) 195 print "Time: %i" % (int(time.time() * 1000) - starttime)
260 196
261 197
262 198