diff gene_identification.py @ 4:069419cccba4 draft

Uploaded
author davidvanzessen
date Mon, 22 Sep 2014 10:19:36 -0400
parents 2f4298673519
children 71a12810eff3
line wrap: on
line diff
--- a/gene_identification.py	Wed Sep 17 07:25:17 2014 -0400
+++ b/gene_identification.py	Mon Sep 22 10:19:36 2014 -0400
@@ -5,31 +5,38 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")
-parser.add_argument("--outdir", help="Output directory, 7 output files will be written here")
+parser.add_argument("--output", help="The annotated summary output file")
 
 args = parser.parse_args()
 
 infile = args.input
 #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"
-outdir = args.outdir
+output = args.output
 #outfile = "identified.txt"
 
 dic = dict()
 total = 0
 
+
 first = True
+IDIndex = 0
+seqIndex = 0
+
 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
-  for line in f:
-    total += 1
-    if first:
-      first = False
-      continue
-    linesplt = line.split("\t")
-    if linesplt[2] == "No results":
-      continue
-    ID = linesplt[1]
-    seq = linesplt[28]
-    dic[ID] = seq
+	for line in f:
+		total += 1
+		if first:
+			linesplt = line.split("\t")
+			IDIndex = linesplt.index("Sequence ID")
+			seqIndex = linesplt.index("Sequence")
+			first = False
+			continue
+		linesplt = line.split("\t")
+		ID = linesplt[IDIndex]
+		if len(linesplt) < 28: #weird rows without a sequence
+			dic[ID] = ""
+		else:
+			dic[ID] = linesplt[seqIndex]
 
 #lambda/kappa reference sequence
 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag",
@@ -128,7 +135,7 @@
 				continue
 			#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i]
 			currentIDHits[key + "_hits"] += 1
-		start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1])
+		start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1])
 		#start_location[ID + "_" + key] = str(start.index(max(start)))
 
 
@@ -141,120 +148,49 @@
 varsInCM = 0
 requiredVarPercentage = 0.7
 
-ca = 0
-ca1 = 0
-ca2 = 0
-cg = 0
-cg1 = 0
-cg2 = 0
-cg3 = 0
-cg4 = 0
-cm = 0
-try:
-	cafile = open(outdir + "/ca.txt", 'w')
-	ca1file = open(outdir + "/ca1.txt", 'w')
-	ca2file = open(outdir + "/ca2.txt", 'w')
-	cgfile = open(outdir + "/cg.txt", 'w')
-	cg1file = open(outdir + "/cg1.txt", 'w')
-	cg2file = open(outdir + "/cg2.txt", 'w')
-	cg3file = open(outdir + "/cg3.txt", 'w')
-	cg4file = open(outdir + "/cg4.txt", 'w')
-	cmfile = open(outdir + "/cm.txt", 'w')
-	unmatchedfile = open(outdir + "/unmatched.txt", 'w')
-	cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n")
-	for ID in hits.keys():
-		currentIDHits = hits[ID]
-		possibleca = float(len(compiledregex["ca"]))
-		possiblecg = float(len(compiledregex["cg"]))
-		possiblecm = float(len(compiledregex["cm"]))
-		cahits = currentIDHits["ca_hits"]
-		cghits = currentIDHits["cg_hits"]
-		cmhits = currentIDHits["cm_hits"]
-		if cahits > cghits and cahits > cmhits: #its a ca gene
-			if cahits <= int(chunksInCA * requiredChunkPercentage):
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n")
+
+first = True
+with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
+	with open(output, 'w') as o:
+		for line in f:
+			total += 1
+			if first:
+				o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+				first = False
 				continue
-			ca += 1
-			ca1hits = currentIDHits["ca1"]
-			ca2hits = currentIDHits["ca2"]
-			cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-			if ca1hits > ca2hits:
-				#print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit"
-				if ca1hits <= int(varsInCA * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
-					continue
-				ca1 += 1
-				ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-			else:
-				#print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit"
-				if ca2hits <= int(varsInCA * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
-					continue
-				ca2 += 1
-				ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-		elif cghits > cahits and cghits > cmhits: #its a cg gene
-			if cghits <= int(chunksInCG * requiredChunkPercentage):
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n")
-				continue
-			cg += 1
-			cg1hits = currentIDHits["cg1"]
-			cg2hits = currentIDHits["cg2"]
-			cg3hits = currentIDHits["cg3"]
-			cg4hits = currentIDHits["cg4"]
-			cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
-				if cg1hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n")
-					continue
-				cg1 += 1
-				cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
-				if cg2hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n")
-					continue
-				cg2 += 1
-				cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
-				if cg3hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n")
-					continue
-				cg3 += 1
-				cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			else: #cg4 gene
-				if cg4hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n")
-					continue
-				cg4 += 1
-				cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-		else: #its a cm gene
-			if cmhits <= int(chunksInCM * requiredChunkPercentage):
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n")
-				continue
-			cm += 1
-			cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
-finally:
-  cafile.close()
-  ca1file.close()
-  ca2file.close()
-  cgfile.close()
-  cg1file.close()
-  cg2file.close()
-  cg3file.close()
-  cg4file.close()
-  cmfile.close()
-  unmatchedfile.close()
-  
-
-#print ca,cg,cm,(ca+cg+cm)
+			linesplt = line.split("\t")
+			if linesplt[2] == "No results":
+				pass
+			ID = linesplt[1]
+			currentIDHits = hits[ID]
+			possibleca = float(len(compiledregex["ca"]))
+			possiblecg = float(len(compiledregex["cg"]))
+			possiblecm = float(len(compiledregex["cm"]))
+			cahits = currentIDHits["ca_hits"]
+			cghits = currentIDHits["cg_hits"]
+			cmhits = currentIDHits["cm_hits"]
+			if cahits > cghits and cahits > cmhits: #its a ca gene
+				ca1hits = currentIDHits["ca1"]
+				ca2hits = currentIDHits["ca2"]
+				if ca1hits > ca2hits:
+					o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+				else:
+					o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+			elif cghits > cahits and cghits > cmhits: #its a cg gene
+				cg1hits = currentIDHits["cg1"]
+				cg2hits = currentIDHits["cg2"]
+				cg3hits = currentIDHits["cg3"]
+				cg4hits = currentIDHits["cg4"]
+				if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
+					o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+				elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
+					o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+				elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
+					o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+				else: #cg4 gene
+					o.write(line.rstrip() + "\tcg3\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			else: #its a cm gene
+				o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
 
 print "Time: %i" % (int(time.time() * 1000) - starttime)