mutation_analysis: gene_identification.py comparison

comparison gene_identification.py @ 4:069419cccba4 draft

Uploaded

author	davidvanzessen
date	Mon, 22 Sep 2014 10:19:36 -0400
parents	2f4298673519
children	71a12810eff3

comparison

equal deleted inserted replaced

-:a0b27058dcac
+:069419cccba4
 import time
 starttime= int(time.time() * 1000)
 parser = argparse.ArgumentParser()
 parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")
-parser.add_argument("--outdir", help="Output directory, 7 output files will be written here")
+parser.add_argument("--output", help="The annotated summary output file")
 args = parser.parse_args()
 infile = args.input
 #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"
-outdir = args.outdir
+output = args.output
 #outfile = "identified.txt"
 dic = dict()
 total = 0
 first = True
+IDIndex = 0
+seqIndex = 0
 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
-for line in f:
+	for line in f:
-total += 1
+		total += 1
-if first:
+		if first:
-first = False
+			linesplt = line.split("\t")
-continue
+			IDIndex = linesplt.index("Sequence ID")
-linesplt = line.split("\t")
+			seqIndex = linesplt.index("Sequence")
-if linesplt[2] == "No results":
+			first = False
-continue
+			continue
-ID = linesplt[1]
+		linesplt = line.split("\t")
-seq = linesplt[28]
+		ID = linesplt[IDIndex]
-dic[ID] = seq
+		if len(linesplt) < 28: #rows with fewer than 28 columns carry no sequence, so store an empty string for this ID
+			dic[ID] = ""
+		else:
+			dic[ID] = linesplt[seqIndex]
 #reference sequences for the ca, cg and cm genes, keyed by gene name
 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag",
 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc",
 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"}
 				break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped
 			else: #only runs if there were no hits
 				continue
 			#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i]
 			currentIDHits[key + "_hits"] += 1
-		start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1])
+		start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1])
 		#start_location[ID + "_" + key] = str(start.index(max(start)))
 chunksInCA = len(compiledregex["ca"])
 chunksInCG = len(compiledregex["cg"])
 varsInCA = float(len(ca1.keys()) * 2)
 varsInCG = float(len(cg1.keys()) * 2) + 1
 varsInCM = 0
 requiredVarPercentage = 0.7
-ca = 0
-ca1 = 0
+first = True
-ca2 = 0
+with open(infile, 'r') as f: #second pass over the input: append best_match, hit percentages and start_locations to each row and write it to the output file
-cg = 0
+	with open(output, 'w') as o:
-cg1 = 0
+		for line in f:
-cg2 = 0
+			total += 1
-cg3 = 0
+			if first:
-cg4 = 0
+				o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-cm = 0
+				first = False
-try:
-	cafile = open(outdir + "/ca.txt", 'w')
-	ca1file = open(outdir + "/ca1.txt", 'w')
-	ca2file = open(outdir + "/ca2.txt", 'w')
-	cgfile = open(outdir + "/cg.txt", 'w')
-	cg1file = open(outdir + "/cg1.txt", 'w')
-	cg2file = open(outdir + "/cg2.txt", 'w')
-	cg3file = open(outdir + "/cg3.txt", 'w')
-	cg4file = open(outdir + "/cg4.txt", 'w')
-	cmfile = open(outdir + "/cm.txt", 'w')
-	unmatchedfile = open(outdir + "/unmatched.txt", 'w')
-	cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n")
-	for ID in hits.keys():
-		currentIDHits = hits[ID]
-		possibleca = float(len(compiledregex["ca"]))
-		possiblecg = float(len(compiledregex["cg"]))
-		possiblecm = float(len(compiledregex["cm"]))
-		cahits = currentIDHits["ca_hits"]
-		cghits = currentIDHits["cg_hits"]
-		cmhits = currentIDHits["cm_hits"]
-		if cahits > cghits and cahits > cmhits: #its a ca gene
-			if cahits <= int(chunksInCA * requiredChunkPercentage):
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n")
 				continue
-			ca += 1
+			linesplt = line.split("\t")
-			ca1hits = currentIDHits["ca1"]
+			if linesplt[2] == "No results":
-			ca2hits = currentIDHits["ca2"]
+				pass
-			cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+			ID = linesplt[1]
-			if ca1hits > ca2hits:
+			currentIDHits = hits[ID]
-				#print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit"
+			possibleca = float(len(compiledregex["ca"]))
-				if ca1hits <= int(varsInCA * requiredVarPercentage):
+			possiblecg = float(len(compiledregex["cg"]))
-					unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
+			possiblecm = float(len(compiledregex["cm"]))
-					continue
+			cahits = currentIDHits["ca_hits"]
-				ca1 += 1
+			cghits = currentIDHits["cg_hits"]
-				ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+			cmhits = currentIDHits["cm_hits"]
-			else:
+			if cahits > cghits and cahits > cmhits: #its a ca gene
-				#print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit"
+				ca1hits = currentIDHits["ca1"]
-				if ca2hits <= int(varsInCA * requiredVarPercentage):
+				ca2hits = currentIDHits["ca2"]
-					unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
+				if ca1hits > ca2hits:
-					continue
+					o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-				ca2 += 1
+				else:
-				ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+					o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-		elif cghits > cahits and cghits > cmhits: #its a cg gene
+			elif cghits > cahits and cghits > cmhits: #its a cg gene
-			if cghits <= int(chunksInCG * requiredChunkPercentage):
+				cg1hits = currentIDHits["cg1"]
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n")
+				cg2hits = currentIDHits["cg2"]
-				continue
+				cg3hits = currentIDHits["cg3"]
-			cg += 1
+				cg4hits = currentIDHits["cg4"]
-			cg1hits = currentIDHits["cg1"]
+				if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
-			cg2hits = currentIDHits["cg2"]
+					o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			cg3hits = currentIDHits["cg3"]
+				elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
-			cg4hits = currentIDHits["cg4"]
+					o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+				elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
-			if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
+					o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-				if cg1hits <= int(varsInCG * requiredVarPercentage):
+				else: #cg4 gene
-					unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n")
+					o.write(line.rstrip() + "\tcg3\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-					continue
+			else: #its a cm gene
-				cg1 += 1
+				o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-				cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
-				if cg2hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n")
-					continue
-				cg2 += 1
-				cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
-				if cg3hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n")
-					continue
-				cg3 += 1
-				cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			else: #cg4 gene
-				if cg4hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n")
-					continue
-				cg4 += 1
-				cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-		else: #its a cm gene
-			if cmhits <= int(chunksInCM * requiredChunkPercentage):
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n")
-				continue
-			cm += 1
-			cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
-finally:
-cafile.close()
-ca1file.close()
-ca2file.close()
-cgfile.close()
-cg1file.close()
-cg2file.close()
-cg3file.close()
-cg4file.close()
-cmfile.close()
-unmatchedfile.close()
-#print ca,cg,cm,(ca+cg+cm)
 print "Time: %i" % (int(time.time() * 1000) - starttime)

Mercurial > repos > davidvanzessen > mutation_analysis

comparison gene_identification.py @ 4:069419cccba4 draft