comparison gene_identification.py @ 10:4b83265b2686 draft

Uploaded
author davidvanzessen
date Thu, 26 Mar 2015 10:00:24 -0400
parents 3f4b4ef46c7f
children b0242cd1da34
comparison
equal deleted inserted replaced
9:4c4149fa0bcb 10:4b83265b2686
35 ID = linesplt[IDIndex] 35 ID = linesplt[IDIndex]
36 if len(linesplt) < 28: #weird rows without a sequence 36 if len(linesplt) < 28: #weird rows without a sequence
37 dic[ID] = "" 37 dic[ID] = ""
38 else: 38 else:
39 dic[ID] = linesplt[seqIndex] 39 dic[ID] = linesplt[seqIndex]
40
41 print "Number of input sequences:", len(dic)
40 42
41 #lambda/kappa reference sequence 43 #lambda/kappa reference sequence
42 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", 44 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctg",
43 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", 45 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgacca",
44 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} 46 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"}
45 47
46 compiledregex = {"ca": [], 48 compiledregex = {"ca": [],
47 "cg": [], 49 "cg": [],
48 "cm": []} 50 "cm": []}
117 break 119 break
118 regex, hasVar = regexp 120 regex, hasVar = regexp
119 matches = regex.finditer(seq[lastindex:]) 121 matches = regex.finditer(seq[lastindex:])
120 for match in matches: #for every match with the current regex, only uses the first hit 122 for match in matches: #for every match with the current regex, only uses the first hit
121 lastindex += match.start() 123 lastindex += match.start()
122 print ID
123 print lastindex
124 print chunklength
125 print i
126 print seq[lastindex:]
127 print start
128 print len(seq)
129 print relativeStartLocation
130 print "-------------------"
131 start[relativeStartLocation] += 1 124 start[relativeStartLocation] += 1
132 if hasVar: #if the regex has a variable nt in it 125 if hasVar: #if the regex has a variable nt in it
133 chunkstart = chunklength / 2 * i #where in the reference does this chunk start 126 chunkstart = chunklength / 2 * i #where in the reference does this chunk start
134 chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end 127 chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end
135 if key == "ca": #just calculate the variable nt score for 'ca', cheaper 128 if key == "ca": #just calculate the variable nt score for 'ca', cheaper
160 varsInCM = 0 153 varsInCM = 0
161 requiredVarPercentage = 0.7 154 requiredVarPercentage = 0.7
162 155
163 156
164 first = True 157 first = True
158 seq_write_count=0
165 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence 159 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
166 with open(output, 'w') as o: 160 with open(output, 'w') as o:
167 for line in f: 161 for line in f:
168 total += 1 162 total += 1
169 if first: 163 if first:
179 possiblecg = float(len(compiledregex["cg"])) 173 possiblecg = float(len(compiledregex["cg"]))
180 possiblecm = float(len(compiledregex["cm"])) 174 possiblecm = float(len(compiledregex["cm"]))
181 cahits = currentIDHits["ca_hits"] 175 cahits = currentIDHits["ca_hits"]
182 cghits = currentIDHits["cg_hits"] 176 cghits = currentIDHits["cg_hits"]
183 cmhits = currentIDHits["cm_hits"] 177 cmhits = currentIDHits["cm_hits"]
184 if cahits > cghits and cahits > cmhits: #its a ca gene 178 if cahits >= cghits and cahits >= cmhits: #its a ca gene
185 ca1hits = currentIDHits["ca1"] 179 ca1hits = currentIDHits["ca1"]
186 ca2hits = currentIDHits["ca2"] 180 ca2hits = currentIDHits["ca2"]
187 if ca1hits > ca2hits: 181 if ca1hits >= ca2hits:
188 o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 182 o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
189 else: 183 else:
190 o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 184 o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
191 elif cghits > cahits and cghits > cmhits: #its a cg gene 185 elif cghits >= cahits and cghits >= cmhits: #its a cg gene
192 cg1hits = currentIDHits["cg1"] 186 cg1hits = currentIDHits["cg1"]
193 cg2hits = currentIDHits["cg2"] 187 cg2hits = currentIDHits["cg2"]
194 cg3hits = currentIDHits["cg3"] 188 cg3hits = currentIDHits["cg3"]
195 cg4hits = currentIDHits["cg4"] 189 cg4hits = currentIDHits["cg4"]
196 if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene 190 if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene
197 o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 191 o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
198 elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene 192 elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene
199 o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 193 o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
200 elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene 194 elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene
201 o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 195 o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
202 else: #cg4 gene 196 else: #cg4 gene
203 o.write(line.rstrip() + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 197 o.write(line.rstrip() + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
204 else: #its a cm gene 198 else: #its a cm gene
205 o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 199 o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
200 seq_write_count += 1
206 201
207 print "Time: %i" % (int(time.time() * 1000) - starttime) 202 print "Time: %i" % (int(time.time() * 1000) - starttime)
208 203
209 204 print "Number of sequences written to file:", seq_write_count
210 205
211 206
212 207
213 208
214 209