Mercurial > repos > davidvanzessen > mutation_analysis
comparison gene_identification.py @ 4:069419cccba4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 22 Sep 2014 10:19:36 -0400 |
parents | 2f4298673519 |
children | 71a12810eff3 |
comparison
equal
deleted
inserted
replaced
3:a0b27058dcac | 4:069419cccba4 |
---|---|
3 import time | 3 import time |
4 starttime= int(time.time() * 1000) | 4 starttime= int(time.time() * 1000) |
5 | 5 |
6 parser = argparse.ArgumentParser() | 6 parser = argparse.ArgumentParser() |
7 parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") | 7 parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") |
8 parser.add_argument("--outdir", help="Output directory, 7 output files will be written here") | 8 parser.add_argument("--output", help="The annotated summary output file") |
9 | 9 |
10 args = parser.parse_args() | 10 args = parser.parse_args() |
11 | 11 |
12 infile = args.input | 12 infile = args.input |
13 #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt" | 13 #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt" |
14 outdir = args.outdir | 14 output = args.output |
15 #outfile = "identified.txt" | 15 #outfile = "identified.txt" |
16 | 16 |
17 dic = dict() | 17 dic = dict() |
18 total = 0 | 18 total = 0 |
19 | 19 |
20 | |
20 first = True | 21 first = True |
22 IDIndex = 0 | |
23 seqIndex = 0 | |
24 | |
21 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence | 25 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence |
22 for line in f: | 26 for line in f: |
23 total += 1 | 27 total += 1 |
24 if first: | 28 if first: |
25 first = False | 29 linesplt = line.split("\t") |
26 continue | 30 IDIndex = linesplt.index("Sequence ID") |
27 linesplt = line.split("\t") | 31 seqIndex = linesplt.index("Sequence") |
28 if linesplt[2] == "No results": | 32 first = False |
29 continue | 33 continue |
30 ID = linesplt[1] | 34 linesplt = line.split("\t") |
31 seq = linesplt[28] | 35 ID = linesplt[IDIndex] |
32 dic[ID] = seq | 36 if len(linesplt) < 28: #weird rows without a sequence |
37 dic[ID] = "" | |
38 else: | |
39 dic[ID] = linesplt[seqIndex] | |
33 | 40 |
34 #lambda/kappa reference sequence | 41 #lambda/kappa reference sequence |
35 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", | 42 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", |
36 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", | 43 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", |
37 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} | 44 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} |
126 break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped | 133 break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped |
127 else: #only runs if there were no hits | 134 else: #only runs if there were no hits |
128 continue | 135 continue |
129 #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] | 136 #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] |
130 currentIDHits[key + "_hits"] += 1 | 137 currentIDHits[key + "_hits"] += 1 |
131 start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1]) | 138 start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1]) |
132 #start_location[ID + "_" + key] = str(start.index(max(start))) | 139 #start_location[ID + "_" + key] = str(start.index(max(start))) |
133 | 140 |
134 | 141 |
135 chunksInCA = len(compiledregex["ca"]) | 142 chunksInCA = len(compiledregex["ca"]) |
136 chunksInCG = len(compiledregex["cg"]) | 143 chunksInCG = len(compiledregex["cg"]) |
139 varsInCA = float(len(ca1.keys()) * 2) | 146 varsInCA = float(len(ca1.keys()) * 2) |
140 varsInCG = float(len(cg1.keys()) * 2) + 1 | 147 varsInCG = float(len(cg1.keys()) * 2) + 1 |
141 varsInCM = 0 | 148 varsInCM = 0 |
142 requiredVarPercentage = 0.7 | 149 requiredVarPercentage = 0.7 |
143 | 150 |
144 ca = 0 | 151 |
145 ca1 = 0 | 152 first = True |
146 ca2 = 0 | 153 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence |
147 cg = 0 | 154 with open(output, 'w') as o: |
148 cg1 = 0 | 155 for line in f: |
149 cg2 = 0 | 156 total += 1 |
150 cg3 = 0 | 157 if first: |
151 cg4 = 0 | 158 o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") |
152 cm = 0 | 159 first = False |
153 try: | |
154 cafile = open(outdir + "/ca.txt", 'w') | |
155 ca1file = open(outdir + "/ca1.txt", 'w') | |
156 ca2file = open(outdir + "/ca2.txt", 'w') | |
157 cgfile = open(outdir + "/cg.txt", 'w') | |
158 cg1file = open(outdir + "/cg1.txt", 'w') | |
159 cg2file = open(outdir + "/cg2.txt", 'w') | |
160 cg3file = open(outdir + "/cg3.txt", 'w') | |
161 cg4file = open(outdir + "/cg4.txt", 'w') | |
162 cmfile = open(outdir + "/cm.txt", 'w') | |
163 unmatchedfile = open(outdir + "/unmatched.txt", 'w') | |
164 cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
165 ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
166 ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
167 cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
168 cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
169 cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
170 cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
171 cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
172 cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") | |
173 unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n") | |
174 for ID in hits.keys(): | |
175 currentIDHits = hits[ID] | |
176 possibleca = float(len(compiledregex["ca"])) | |
177 possiblecg = float(len(compiledregex["cg"])) | |
178 possiblecm = float(len(compiledregex["cm"])) | |
179 cahits = currentIDHits["ca_hits"] | |
180 cghits = currentIDHits["cg_hits"] | |
181 cmhits = currentIDHits["cm_hits"] | |
182 if cahits > cghits and cahits > cmhits: #its a ca gene | |
183 if cahits <= int(chunksInCA * requiredChunkPercentage): | |
184 unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n") | |
185 continue | 160 continue |
186 ca += 1 | 161 linesplt = line.split("\t") |
187 ca1hits = currentIDHits["ca1"] | 162 if linesplt[2] == "No results": |
188 ca2hits = currentIDHits["ca2"] | 163 pass |
189 cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 164 ID = linesplt[1] |
190 if ca1hits > ca2hits: | 165 currentIDHits = hits[ID] |
191 #print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit" | 166 possibleca = float(len(compiledregex["ca"])) |
192 if ca1hits <= int(varsInCA * requiredVarPercentage): | 167 possiblecg = float(len(compiledregex["cg"])) |
193 unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") | 168 possiblecm = float(len(compiledregex["cm"])) |
194 continue | 169 cahits = currentIDHits["ca_hits"] |
195 ca1 += 1 | 170 cghits = currentIDHits["cg_hits"] |
196 ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 171 cmhits = currentIDHits["cm_hits"] |
197 else: | 172 if cahits > cghits and cahits > cmhits: #its a ca gene |
198 #print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit" | 173 ca1hits = currentIDHits["ca1"] |
199 if ca2hits <= int(varsInCA * requiredVarPercentage): | 174 ca2hits = currentIDHits["ca2"] |
200 unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") | 175 if ca1hits > ca2hits: |
201 continue | 176 o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") |
202 ca2 += 1 | 177 else: |
203 ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 178 o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") |
204 elif cghits > cahits and cghits > cmhits: #its a cg gene | 179 elif cghits > cahits and cghits > cmhits: #its a cg gene |
205 if cghits <= int(chunksInCG * requiredChunkPercentage): | 180 cg1hits = currentIDHits["cg1"] |
206 unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n") | 181 cg2hits = currentIDHits["cg2"] |
207 continue | 182 cg3hits = currentIDHits["cg3"] |
208 cg += 1 | 183 cg4hits = currentIDHits["cg4"] |
209 cg1hits = currentIDHits["cg1"] | 184 if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene |
210 cg2hits = currentIDHits["cg2"] | 185 o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
211 cg3hits = currentIDHits["cg3"] | 186 elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene |
212 cg4hits = currentIDHits["cg4"] | 187 o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
213 cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 188 elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene |
214 if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene | 189 o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
215 if cg1hits <= int(varsInCG * requiredVarPercentage): | 190 else: #cg4 gene |
216 unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n") | 191 o.write(line.rstrip() + "\tcg3\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
217 continue | 192 else: #its a cm gene |
218 cg1 += 1 | 193 o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
219 cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | |
220 elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene | |
221 if cg2hits <= int(varsInCG * requiredVarPercentage): | |
222 unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n") | |
223 continue | |
224 cg2 += 1 | |
225 cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | |
226 elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene | |
227 if cg3hits <= int(varsInCG * requiredVarPercentage): | |
228 unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n") | |
229 continue | |
230 cg3 += 1 | |
231 cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | |
232 else: #cg4 gene | |
233 if cg4hits <= int(varsInCG * requiredVarPercentage): | |
234 unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n") | |
235 continue | |
236 cg4 += 1 | |
237 cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | |
238 else: #its a cm gene | |
239 if cmhits <= int(chunksInCM * requiredChunkPercentage): | |
240 unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n") | |
241 continue | |
242 cm += 1 | |
243 cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") | |
244 finally: | |
245 cafile.close() | |
246 ca1file.close() | |
247 ca2file.close() | |
248 cgfile.close() | |
249 cg1file.close() | |
250 cg2file.close() | |
251 cg3file.close() | |
252 cg4file.close() | |
253 cmfile.close() | |
254 unmatchedfile.close() | |
255 | |
256 | |
257 #print ca,cg,cm,(ca+cg+cm) | |
258 | 194 |
259 print "Time: %i" % (int(time.time() * 1000) - starttime) | 195 print "Time: %i" % (int(time.time() * 1000) - starttime) |
260 | 196 |
261 | 197 |
262 | 198 |