Mercurial > repos > davidvanzessen > mutation_analysis
comparison gene_identification.py @ 10:4b83265b2686 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 26 Mar 2015 10:00:24 -0400 |
parents | 3f4b4ef46c7f |
children | b0242cd1da34 |
comparison
equal
deleted
inserted
replaced
9:4c4149fa0bcb | 10:4b83265b2686 |
---|---|
35 ID = linesplt[IDIndex] | 35 ID = linesplt[IDIndex] |
36 if len(linesplt) < 28: #weird rows without a sequence | 36 if len(linesplt) < 28: #weird rows without a sequence |
37 dic[ID] = "" | 37 dic[ID] = "" |
38 else: | 38 else: |
39 dic[ID] = linesplt[seqIndex] | 39 dic[ID] = linesplt[seqIndex] |
40 | |
41 print "Number of input sequences:", len(dic) | |
40 | 42 |
41 #lambda/kappa reference sequence | 43 #lambda/kappa reference sequence |
42 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", | 44 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctg", |
43 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", | 45 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgacca", |
44 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} | 46 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} |
45 | 47 |
46 compiledregex = {"ca": [], | 48 compiledregex = {"ca": [], |
47 "cg": [], | 49 "cg": [], |
48 "cm": []} | 50 "cm": []} |
117 break | 119 break |
118 regex, hasVar = regexp | 120 regex, hasVar = regexp |
119 matches = regex.finditer(seq[lastindex:]) | 121 matches = regex.finditer(seq[lastindex:]) |
120 for match in matches: #for every match with the current regex, only uses the first hit | 122 for match in matches: #for every match with the current regex, only uses the first hit |
121 lastindex += match.start() | 123 lastindex += match.start() |
122 print ID | |
123 print lastindex | |
124 print chunklength | |
125 print i | |
126 print seq[lastindex:] | |
127 print start | |
128 print len(seq) | |
129 print relativeStartLocation | |
130 print "-------------------" | |
131 start[relativeStartLocation] += 1 | 124 start[relativeStartLocation] += 1 |
132 if hasVar: #if the regex has a variable nt in it | 125 if hasVar: #if the regex has a variable nt in it |
133 chunkstart = chunklength / 2 * i #where in the reference does this chunk start | 126 chunkstart = chunklength / 2 * i #where in the reference does this chunk start |
134 chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end | 127 chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end |
135 if key == "ca": #just calculate the variable nt score for 'ca', cheaper | 128 if key == "ca": #just calculate the variable nt score for 'ca', cheaper |
160 varsInCM = 0 | 153 varsInCM = 0 |
161 requiredVarPercentage = 0.7 | 154 requiredVarPercentage = 0.7 |
162 | 155 |
163 | 156 |
164 first = True | 157 first = True |
158 seq_write_count=0 | |
165 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence | 159 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence |
166 with open(output, 'w') as o: | 160 with open(output, 'w') as o: |
167 for line in f: | 161 for line in f: |
168 total += 1 | 162 total += 1 |
169 if first: | 163 if first: |
179 possiblecg = float(len(compiledregex["cg"])) | 173 possiblecg = float(len(compiledregex["cg"])) |
180 possiblecm = float(len(compiledregex["cm"])) | 174 possiblecm = float(len(compiledregex["cm"])) |
181 cahits = currentIDHits["ca_hits"] | 175 cahits = currentIDHits["ca_hits"] |
182 cghits = currentIDHits["cg_hits"] | 176 cghits = currentIDHits["cg_hits"] |
183 cmhits = currentIDHits["cm_hits"] | 177 cmhits = currentIDHits["cm_hits"] |
184 if cahits > cghits and cahits > cmhits: #its a ca gene | 178 if cahits >= cghits and cahits >= cmhits: #its a ca gene |
185 ca1hits = currentIDHits["ca1"] | 179 ca1hits = currentIDHits["ca1"] |
186 ca2hits = currentIDHits["ca2"] | 180 ca2hits = currentIDHits["ca2"] |
187 if ca1hits > ca2hits: | 181 if ca1hits >= ca2hits: |
188 o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 182 o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") |
189 else: | 183 else: |
190 o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 184 o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") |
191 elif cghits > cahits and cghits > cmhits: #its a cg gene | 185 elif cghits >= cahits and cghits >= cmhits: #its a cg gene |
192 cg1hits = currentIDHits["cg1"] | 186 cg1hits = currentIDHits["cg1"] |
193 cg2hits = currentIDHits["cg2"] | 187 cg2hits = currentIDHits["cg2"] |
194 cg3hits = currentIDHits["cg3"] | 188 cg3hits = currentIDHits["cg3"] |
195 cg4hits = currentIDHits["cg4"] | 189 cg4hits = currentIDHits["cg4"] |
196 if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene | 190 if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene |
197 o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 191 o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
198 elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene | 192 elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene |
199 o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 193 o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
200 elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene | 194 elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene |
201 o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 195 o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
202 else: #cg4 gene | 196 else: #cg4 gene |
203 o.write(line.rstrip() + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 197 o.write(line.rstrip() + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
204 else: #its a cm gene | 198 else: #its a cm gene |
205 o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 199 o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
200 seq_write_count += 1 | |
206 | 201 |
207 print "Time: %i" % (int(time.time() * 1000) - starttime) | 202 print "Time: %i" % (int(time.time() * 1000) - starttime) |
208 | 203 |
209 | 204 print "Number of sequences written to file:", seq_write_count |
210 | 205 |
211 | 206 |
212 | 207 |
213 | 208 |
214 | 209 |