Mercurial > repos > davidvanzessen > mutation_analysis

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.r	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,13 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input = args[1]
+filterf = args[2]
+output = args[3]
+
+
+dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
+filterdat = read.table(filterf, header=T, sep="\t", fill=T, stringsAsFactors=F)
+
+dat = dat[dat$Sequence.ID %in% filterdat$ID,]
+
+write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_identification.py	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,265 @@
+import re
+import argparse
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")
+parser.add_argument("--outdir", help="Output directory, 7 output files will be written here")
+
+args = parser.parse_args()
+
+infile = args.input
+#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"
+outdir = args.outdir
+#outfile = "identified.txt"
+
+dic = dict()
+total = 0
+
+first = True
+with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
+  for line in f:
+    total += 1
+    if first:
+      first = False
+      continue
+    linesplt = line.split("\t")
+    if linesplt[2] == "No results":
+      continue
+    ID = linesplt[1]
+    seq = linesplt[28]
+    dic[ID] = seq
+
+#lambda/kappa reference sequence
+searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag",
+                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc",
+                 "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"}
+
+compiledregex = {"ca": [],
+                 "cg": [],
+                 "cm": []}
+
+#lambda/kappa reference sequence variable nucleotides
+ca1 = {38: 't', 39: 'g', 48: 'a', 49: 'g', 51: 'c', 68: 'a', 73: 'c'}
+ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'}
+cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
+cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'}
+cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
+cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'}
+
+#reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap
+chunklength = 8
+
+#create the chunks of the reference sequence with regular expressions for the variable nucleotides
+for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2):
+  pos = i
+  chunk = searchstrings["ca"][i:i+chunklength]
+  result = ""
+  varsInResult = 0
+  for c in chunk:
+    if pos in ca1.keys():
+      varsInResult += 1
+      result += "[" + ca1[pos] + ca2[pos] + "]"
+    else:
+      result += c
+    pos += 1
+  compiledregex["ca"].append((re.compile(result), varsInResult))
+
+for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2):
+  pos = i
+  chunk = searchstrings["cg"][i:i+chunklength]
+  result = ""
+  varsInResult = 0
+  for c in chunk:
+    if pos in cg1.keys():
+      varsInResult += 1
+      result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]"
+    else:
+      result += c
+    pos += 1
+  compiledregex["cg"].append((re.compile(result), varsInResult))
+
+for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):
+  compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))
+
+
+
+def removeAndReturnMaxIndex(x): #simplifies a list comprehension
+  m = max(x)
+  index = x.index(m)
+  x[index] = 0
+  return index
+
+
+start_location = dict()
+hits = dict()
+alltotal = 0
+for key in compiledregex.keys(): #for ca/cg/cm
+	regularexpressions = compiledregex[key] #get the compiled regular expressions
+	for ID in dic.keys()[0:]: #for every ID
+		if ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists
+			hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0}
+		currentIDHits = hits[ID]
+		seq = dic[ID]
+		lastindex = 0
+		start = [0] * len(seq)
+		for i, regexp in enumerate(regularexpressions): #for every regular expression
+			regex, hasVar = regexp
+			matches = regex.finditer(seq[lastindex:])
+			for match in matches: #for every match with the current regex, only uses the first hit
+				lastindex += match.start()
+				start[lastindex - chunklength / 2 * i] += 1
+				if hasVar: #if the regex has a variable nt in it
+					chunkstart = chunklength / 2 * i #where in the reference does this chunk start
+					chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end
+					if key == "ca": #just calculate the variable nt score for 'ca', cheaper
+						currentIDHits["ca1"] += len([1 for x in ca1 if chunkstart <= x < chunkend and ca1[x] == seq[lastindex + x - chunkstart]])
+						currentIDHits["ca2"] += len([1 for x in ca2 if chunkstart <= x < chunkend and ca2[x] == seq[lastindex + x - chunkstart]])
+					elif key == "cg": #just calculate the variable nt score for 'cg', cheaper
+						currentIDHits["cg1"] += len([1 for x in cg1 if chunkstart <= x < chunkend and cg1[x] == seq[lastindex + x - chunkstart]])
+						currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]])
+						currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]])
+						currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])
+					else: #key == "cm" #no variable regions in 'cm'
+						pass
+				break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped
+			else: #only runs if there were no hits
+				continue
+			#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i]
+			currentIDHits[key + "_hits"] += 1
+		start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1])
+		#start_location[ID + "_" + key] = str(start.index(max(start)))
+
+
+chunksInCA = len(compiledregex["ca"])
+chunksInCG = len(compiledregex["cg"])
+chunksInCM = len(compiledregex["cm"])
+requiredChunkPercentage = 0.7
+varsInCA = float(len(ca1.keys()) * 2)
+varsInCG = float(len(cg1.keys()) * 2) + 1
+varsInCM = 0
+requiredVarPercentage = 0.7
+
+ca = 0
+ca1 = 0
+ca2 = 0
+cg = 0
+cg1 = 0
+cg2 = 0
+cg3 = 0
+cg4 = 0
+cm = 0
+try:
+	cafile = open(outdir + "/ca.txt", 'w')
+	ca1file = open(outdir + "/ca1.txt", 'w')
+	ca2file = open(outdir + "/ca2.txt", 'w')
+	cgfile = open(outdir + "/cg.txt", 'w')
+	cg1file = open(outdir + "/cg1.txt", 'w')
+	cg2file = open(outdir + "/cg2.txt", 'w')
+	cg3file = open(outdir + "/cg3.txt", 'w')
+	cg4file = open(outdir + "/cg4.txt", 'w')
+	cmfile = open(outdir + "/cm.txt", 'w')
+	unmatchedfile = open(outdir + "/unmatched.txt", 'w')
+	cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n")
+	for ID in hits.keys():
+		currentIDHits = hits[ID]
+		possibleca = float(len(compiledregex["ca"]))
+		possiblecg = float(len(compiledregex["cg"]))
+		possiblecm = float(len(compiledregex["cm"]))
+		cahits = currentIDHits["ca_hits"]
+		cghits = currentIDHits["cg_hits"]
+		cmhits = currentIDHits["cm_hits"]
+		if cahits > cghits and cahits > cmhits: #its a ca gene
+			if cahits <= int(chunksInCA * requiredChunkPercentage):
+				unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n")
+				continue
+			ca += 1
+			ca1hits = currentIDHits["ca1"]
+			ca2hits = currentIDHits["ca2"]
+			cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+			if ca1hits > ca2hits:
+				#print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit"
+				if ca1hits <= int(varsInCA * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
+					continue
+				ca1 += 1
+				ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+			else:
+				#print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit"
+				if ca2hits <= int(varsInCA * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
+					continue
+				ca2 += 1
+				ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+		elif cghits > cahits and cghits > cmhits: #its a cg gene
+			if cghits <= int(chunksInCG * requiredChunkPercentage):
+				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n")
+				continue
+			cg += 1
+			cg1hits = currentIDHits["cg1"]
+			cg2hits = currentIDHits["cg2"]
+			cg3hits = currentIDHits["cg3"]
+			cg4hits = currentIDHits["cg4"]
+			cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
+				if cg1hits <= int(varsInCG * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n")
+					continue
+				cg1 += 1
+				cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
+				if cg2hits <= int(varsInCG * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n")
+					continue
+				cg2 += 1
+				cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
+				if cg3hits <= int(varsInCG * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n")
+					continue
+				cg3 += 1
+				cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			else: #cg4 gene
+				if cg4hits <= int(varsInCG * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n")
+					continue
+				cg4 += 1
+				cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+		else: #its a cm gene
+			if cmhits <= int(chunksInCM * requiredChunkPercentage):
+				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n")
+				continue
+			cm += 1
+			cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
+finally:
+  cafile.close()
+  ca1file.close()
+  ca2file.close()
+  cgfile.close()
+  cg1file.close()
+  cg2file.close()
+  cg3file.close()
+  cg4file.close()
+  cmfile.close()
+  unmatchedfile.close()
+
+
+#print ca,cg,cm,(ca+cg+cm)
+
+
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.py	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,80 @@
+import re
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--mutationfile", help="The '7_V-REGION-mutation-and-AA-change-table' file from the IMGT output")
+parser.add_argument("--hotspotfile", help="The '10_V-REGION-mutation-hotspots' file from the IMGT output")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+mutationfile = args.mutationfile #"test_VH-Ca_Cg_25nt/7_V-REGION-mutation-and-AA-change-table_test_VH-Ca_Cg_25nt_241013.txt"
+hotspotsfile = args.hotspotfile #"test_VH-Ca_Cg_25nt/10_V-REGION-mutation-hotspots_test_VH-Ca_Cg_25nt_241013.txt"
+outfile = args.output #"out.txt"
+
+mutationdic = dict()
+mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
+linecount = 0
+
+with open(mutationfile, 'r') as i:
+  for line in i.readlines()[1:]:
+		linecount += 1
+		linesplt = line.split("\t")
+		if linesplt[2] != "productive":
+			continue
+		ID = linesplt[1]
+		mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[5].split("|") if x]
+		mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[6].split("|") if x]
+		mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[7].split("|") if x]
+		mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[8].split("|") if x]
+		mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
+		mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[9].split("|") if x]
+
+if linecount == 0:
+	print "No data, exiting"
+	with open(outfile, 'w') as o:
+		o.write("RGYW (%),0,0,NA\n")
+		o.write("WRCY (%),0,0,NA\n")
+		o.write("WA (%),0,0,NA\n")
+		o.write("TW (%),0,0,NA\n")
+	import sys
+	sys.exit()
+
+hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
+RGYWCount = 0
+WRCYCount = 0
+WACount = 0
+TWCount = 0
+
+with open(hotspotsfile, 'r') as i:
+	for line in i.readlines()[1:]:
+		linesplt = line.split("\t")
+		if linesplt[2] != "productive":
+			continue
+		ID = linesplt[1]
+		RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[6].split("|") if x]]
+		WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[7].split("|") if x]]
+		WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[4].split("|") if x]]
+		TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[5].split("|") if x]]
+		RGYWCount += sum([1 for (x,y,z) in RGYW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
+		WRCYCount += sum([1 for (x,y,z) in WRCY if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
+		WACount += sum([1 for (x,y,z) in WA if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
+		TWCount += sum([1 for (x,y,z) in TW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
+
+path = outfile[:outfile.rfind("/") + 1] + "mutations.txt"
+value = 0;
+with open(path, 'r') as f:
+	value = f.readlines()[0].split(",")[1]
+with open(outfile, 'w') as o:
+	if value != "0":
+		o.write("RGYW (%)," + str(RGYWCount) + "," + value + "," + str(round(RGYWCount / float(value) * 100, 1)) + "\n")
+		o.write("WRCY (%)," + str(WRCYCount) + "," + value + "," + str(round(WRCYCount / float(value) * 100, 1)) + "\n")
+		o.write("WA (%)," + str(WACount) + "," + value + "," + str(round(WACount / float(value) * 100, 1)) + "\n")
+		o.write("TW (%)," + str(TWCount) + "," + value + "," + str(round(TWCount / float(value) * 100, 1)) + "\n")
+	else:
+		o.write("RGYW (%)," + str(RGYWCount) + ",0,NA\n")
+		o.write("WRCY (%)," + str(WRCYCount) + ",0,NA\n")
+		o.write("WA (%)," + str(WACount) + ",0,NA\n")
+		o.write("TW (%)," + str(TWCount) + ",0,NA\n")
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.r	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,328 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input = args[1]
+summaryinput = args[2]
+outputdir = args[3]
+setwd(outputdir)
+
+#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F)
+dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
+
+datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F)
+datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")]
+
+dat = merge(dat, datSum, by="Sequence.ID", all.x=T)
+
+#dat = dat[dat$Functionality == "productive",]
+
+dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
+dat$VGene = gsub("[*].*", "", dat$VGene)
+
+dat$past = paste(dat$AA.JUNCTION, dat$VGene)
+
+#dat = dat[!duplicated(dat$past), ]
+
+if(length(dat$Sequence.ID) == 0){
+	setwd(outputdir)
+	result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
+	row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
+	write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+	transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
+	row.names(transitionTable) = c("A", "C", "G", "T")
+	transitionTable["A","A"] = NA
+	transitionTable["C","C"] = NA
+	transitionTable["G","G"] = NA
+	transitionTable["T","T"] = NA
+	write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
+	cat("0", file="n.txt")
+	stop("No data")
+}
+
+#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left"))
+
+result = data.frame(x = 1:5, y = 1:5, z = 1:5)
+row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
+
+cleanup_columns = c("FR1.IMGT.c.a",
+                    "FR2.IMGT.g.t",
+                    "CDR1.IMGT.Nb.of.nucleotides",
+                    "CDR2.IMGT.t.a",
+                    "FR1.IMGT.c.g",
+                    "CDR1.IMGT.c.t",
+                    "FR2.IMGT.a.c",
+                    "FR2.IMGT.Nb.of.mutations",
+                    "FR2.IMGT.g.c",
+                    "FR2.IMGT.a.g",
+                    "FR3.IMGT.t.a",
+                    "FR3.IMGT.t.c",
+                    "FR2.IMGT.g.a",
+                    "FR3.IMGT.c.g",
+                    "FR1.IMGT.Nb.of.mutations",
+                    "CDR1.IMGT.g.a",
+                    "CDR1.IMGT.t.g",
+                    "CDR1.IMGT.g.c",
+                    "CDR2.IMGT.Nb.of.nucleotides",
+                    "FR2.IMGT.a.t",
+                    "CDR1.IMGT.Nb.of.mutations",
+                    "CDR1.IMGT.a.g",
+                    "FR3.IMGT.a.c",
+                    "FR1.IMGT.g.a",
+                    "FR3.IMGT.a.g",
+                    "FR1.IMGT.a.t",
+                    "CDR2.IMGT.a.g",
+                    "CDR2.IMGT.Nb.of.mutations",
+                    "CDR2.IMGT.g.t",
+                    "CDR2.IMGT.a.c",
+                    "CDR1.IMGT.t.c",
+                    "FR3.IMGT.g.c",
+                    "FR1.IMGT.g.t",
+                    "FR3.IMGT.g.t",
+                    "CDR1.IMGT.a.t",
+                    "FR1.IMGT.a.g",
+                    "FR3.IMGT.a.t",
+                    "FR3.IMGT.Nb.of.nucleotides",
+                    "FR2.IMGT.t.c",
+                    "CDR2.IMGT.g.a",
+                    "FR2.IMGT.t.a",
+                    "CDR1.IMGT.t.a",
+                    "FR2.IMGT.t.g",
+                    "FR3.IMGT.t.g",
+                    "FR2.IMGT.Nb.of.nucleotides",
+                    "FR1.IMGT.t.a",
+                    "FR1.IMGT.t.g",
+                    "FR3.IMGT.c.t",
+                    "FR1.IMGT.t.c",
+                    "CDR2.IMGT.a.t",
+                    "FR2.IMGT.c.t",
+                    "CDR1.IMGT.g.t",
+                    "CDR2.IMGT.t.g",
+                    "FR1.IMGT.Nb.of.nucleotides",
+                    "CDR1.IMGT.c.g",
+                    "CDR2.IMGT.t.c",
+                    "FR3.IMGT.g.a",
+                    "CDR1.IMGT.a.c",
+                    "FR2.IMGT.c.a",
+                    "FR3.IMGT.Nb.of.mutations",
+                    "FR2.IMGT.c.g",
+                    "CDR2.IMGT.g.c",
+                    "FR1.IMGT.g.c",
+                    "CDR2.IMGT.c.t",
+                    "FR3.IMGT.c.a",
+                    "CDR1.IMGT.c.a",
+                    "CDR2.IMGT.c.g",
+                    "CDR2.IMGT.c.a",
+                    "FR1.IMGT.c.t")
+
+for(col in cleanup_columns){
+  dat[,col] = gsub("\\(.*\\)", "", dat[,col])
+  #dat[dat[,col] == "",] = "0"
+  dat[,col] = as.numeric(dat[,col])
+  dat[is.na(dat[,col]),] = 0
+}
+
+VRegionMutations =  sum(dat$FR1.IMGT.Nb.of.mutations +
+                        dat$CDR1.IMGT.Nb.of.mutations +
+                        dat$FR2.IMGT.Nb.of.mutations +
+                        dat$CDR2.IMGT.Nb.of.mutations +
+                        dat$FR3.IMGT.Nb.of.mutations)
+
+VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides +
+                          dat$CDR1.IMGT.Nb.of.nucleotides +
+                          dat$FR2.IMGT.Nb.of.nucleotides +
+                          dat$CDR2.IMGT.Nb.of.nucleotides +
+                          dat$FR3.IMGT.Nb.of.nucleotides)
+
+result[1,"x"] = VRegionMutations
+result[1,"y"] = VRegionNucleotides
+result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1)
+
+transitionMutations = sum(dat$FR1.IMGT.a.g +
+                          dat$FR1.IMGT.g.a +
+                          dat$FR1.IMGT.c.t +
+                          dat$FR1.IMGT.t.c +
+                          dat$CDR1.IMGT.a.g +
+                          dat$CDR1.IMGT.g.a +
+                          dat$CDR1.IMGT.c.t +
+                          dat$CDR1.IMGT.t.c +
+                          dat$FR2.IMGT.a.g +
+                          dat$FR2.IMGT.g.a +
+                          dat$FR2.IMGT.c.t +
+                          dat$FR2.IMGT.t.c +
+                          dat$CDR2.IMGT.a.g +
+                          dat$CDR2.IMGT.g.a +
+                          dat$CDR2.IMGT.c.t +
+                          dat$CDR2.IMGT.t.c +
+                          dat$FR3.IMGT.a.g +
+                          dat$FR3.IMGT.g.a +
+                          dat$FR3.IMGT.c.t +
+                          dat$FR3.IMGT.t.c)
+
+result[2,"x"] = transitionMutations
+result[2,"y"] = VRegionMutations
+result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1)
+
+transversionMutations = sum(  dat$FR1.IMGT.a.c +
+                              dat$FR1.IMGT.c.a +
+                              dat$FR1.IMGT.a.t +
+                              dat$FR1.IMGT.t.a +
+                              dat$FR1.IMGT.g.c +
+                              dat$FR1.IMGT.c.g +
+                              dat$FR1.IMGT.g.t +
+                              dat$FR1.IMGT.t.g +
+                              dat$CDR1.IMGT.a.c +
+                              dat$CDR1.IMGT.c.a +
+                              dat$CDR1.IMGT.a.t +
+                              dat$CDR1.IMGT.t.a +
+                              dat$CDR1.IMGT.g.c +
+                              dat$CDR1.IMGT.c.g +
+                              dat$CDR1.IMGT.g.t +
+                              dat$CDR1.IMGT.t.g +
+                              dat$FR2.IMGT.a.c +
+                              dat$FR2.IMGT.c.a +
+                              dat$FR2.IMGT.a.t +
+                              dat$FR2.IMGT.t.a +
+                              dat$FR2.IMGT.g.c +
+                              dat$FR2.IMGT.c.g +
+                              dat$FR2.IMGT.g.t +
+                              dat$FR2.IMGT.t.g +
+                              dat$CDR2.IMGT.a.c +
+                              dat$CDR2.IMGT.c.a +
+                              dat$CDR2.IMGT.a.t +
+                              dat$CDR2.IMGT.t.a +
+                              dat$CDR2.IMGT.g.c +
+                              dat$CDR2.IMGT.c.g +
+                              dat$CDR2.IMGT.g.t +
+                              dat$CDR2.IMGT.t.g +
+                              dat$FR3.IMGT.a.c +
+                              dat$FR3.IMGT.c.a +
+                              dat$FR3.IMGT.a.t +
+                              dat$FR3.IMGT.t.a +
+                              dat$FR3.IMGT.g.c +
+                              dat$FR3.IMGT.c.g +
+                              dat$FR3.IMGT.g.t +
+                              dat$FR3.IMGT.t.g)
+
+result[3,"x"] = transversionMutations
+result[3,"y"] = VRegionMutations
+result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1)
+
+transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a +
+                              dat$FR1.IMGT.c.t +
+                              dat$CDR1.IMGT.g.a +
+                              dat$CDR1.IMGT.c.t +
+                              dat$FR2.IMGT.g.a +
+                              dat$FR2.IMGT.c.t +
+                              dat$CDR2.IMGT.g.a +
+                              dat$CDR2.IMGT.c.t +
+                              dat$FR3.IMGT.g.a +
+                              dat$FR3.IMGT.c.t)
+
+totalMutationsAtGC = sum(dat$FR1.IMGT.g.a +
+                         dat$FR1.IMGT.c.t +
+                         dat$FR1.IMGT.c.a +
+                         dat$FR1.IMGT.g.c +
+                         dat$FR1.IMGT.c.g +
+                         dat$FR1.IMGT.g.t +
+                         dat$CDR1.IMGT.g.a +
+                         dat$CDR1.IMGT.c.t +
+                         dat$CDR1.IMGT.c.a +
+                         dat$CDR1.IMGT.g.c +
+                         dat$CDR1.IMGT.c.g +
+                         dat$CDR1.IMGT.g.t +
+                         dat$FR2.IMGT.g.a +
+                         dat$FR2.IMGT.c.t +
+                         dat$FR2.IMGT.c.a +
+                         dat$FR2.IMGT.g.c +
+                         dat$FR2.IMGT.c.g +
+                         dat$FR2.IMGT.g.t +
+                         dat$CDR2.IMGT.g.a +
+                         dat$CDR2.IMGT.c.t +
+                         dat$CDR2.IMGT.c.a +
+                         dat$CDR2.IMGT.g.c +
+                         dat$CDR2.IMGT.c.g +
+                         dat$CDR2.IMGT.g.t +
+                         dat$FR3.IMGT.g.a +
+                         dat$FR3.IMGT.c.t +
+                         dat$FR3.IMGT.c.a +
+                         dat$FR3.IMGT.g.c +
+                         dat$FR3.IMGT.c.g +
+                         dat$FR3.IMGT.g.t)
+
+result[4,"x"] = transitionMutationsAtGC
+result[4,"y"] = totalMutationsAtGC
+result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1)
+
+result[5,"x"] = totalMutationsAtGC
+result[5,"y"] = VRegionMutations
+result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1)
+
+
+#transitiontable
+
+transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
+row.names(transitionTable) = c("A", "C", "G", "T")
+transitionTable["A","A"] = NA
+transitionTable["C","C"] = NA
+transitionTable["G","G"] = NA
+transitionTable["T","T"] = NA
+nts = c("a", "c", "g", "t")
+
+
+for(nt1 in nts){
+  for(nt2 in nts){
+    if(nt1 == nt2){
+      next
+    }
+    NT1 = LETTERS[letters == nt1]
+    NT2 = LETTERS[letters == nt2]
+    FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
+    CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
+    FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
+    CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
+    FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
+    transitionTable[NT1,NT2] = sum( dat[,FR1] +
+                                    dat[,CDR1] +
+                                    dat[,FR2] +
+                                    dat[,CDR2] +
+                                    dat[,FR3])
+  }
+}
+
+setwd(outputdir)
+write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
+cat(length(dat$Sequence.ID), file="n.txt")
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.xml	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,15 @@
+<tool id="mutation_analysis_shm" name="Mutation Analysis" version="1.0">
+	<description></description>
+	<command interpreter="bash">
+		wrapper.sh $in_file $out_file $out_file.files_path
+	</command>
+	<inputs>
+		<param name="in_file" type="data" label="IMGT zip file to be analysed" />
+	</inputs>
+	<outputs>
+		<data format="html" name="out_file" label = "Mutation analysis on ${in_file.name}"/>
+	</outputs>
+	<help>
+		there is no help
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/piechart.r	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,24 @@
+if (!("ggplot2" %in% rownames(installed.packages()))) {
+	install.packages("ggplot2", repos="http://cran.xl-mirror.nl/")
+}
+library(ggplot2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+vls = args[1]
+lbls = args[2]
+name = args[3]
+output = args[4]
+
+vls = as.numeric(unlist(strsplit(vls, ",")))
+lbls = unlist(strsplit(lbls, ","))
+
+pc = ggplot(data.frame(Type=lbls, vls=vls), aes(x = factor(1), y=vls, fill=Type))
+pc = pc + geom_bar(width = 1, stat = "identity")
+pc = pc + coord_polar(theta="y")
+pc = pc + xlab(" ") + ylab(" ") + ggtitle(name)
+
+png(filename=output)
+#pie(vls, labels=lbls, col=rainbow(length(vls)), main=name)
+pc
+dev.off()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wrapper.sh	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,121 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+input=$1
+output=$2
+outdir=$3
+mkdir $outdir
+
+unzip $input -d $PWD/files/ > $PWD/unziplog.log
+cat $PWD/files/*/1_* > $PWD/summary.txt
+cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
+cat $PWD/files/*/8_* > $PWD/mutationstats.txt
+cat $PWD/files/*/10_* > $PWD/hotspots.txt
+
+mkdir $outdir/identification/
+python $dir/gene_identification.py --input $PWD/summary.txt --outdir $outdir/identification/
+genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
+tmp=$PWD/tmp
+tmp2=$PWD/tmp2
+hotspottmp=$PWD/hotspottmp
+mutationtmp=$PWD/mutationtmp
+touch $outdir/mutationandhotspot.txt
+for gene in ${genes[@]}
+do
+	echo "Running $gene <br />" >> $output
+	mkdir $outdir/$gene
+	echo "Filtering input..." >> $output
+	Rscript $dir/filter.r $PWD/summary.txt $outdir/identification/${gene}.txt $outdir/$gene/summary.txt
+	Rscript $dir/filter.r $PWD/mutationanalysis.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationanalysis.txt
+	Rscript $dir/filter.r $PWD/mutationstats.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationstats.txt
+	Rscript $dir/filter.r $PWD/hotspots.txt $outdir/identification/${gene}.txt $outdir/$gene/hotspots.txt
+	echo "done <br />" >> $output
+
+	echo "Running R script on $gene..." >> $output
+	Rscript --verbose $dir/mutation_analysis.r $outdir/$gene/mutationstats.txt $outdir/$gene/summary.txt $outdir/$gene/ 2>&1
+	echo "done <br />" >> $output
+
+	echo "Running Python script..." >> $output
+	python $dir/mutation_analysis.py --mutationfile $outdir/$gene/mutationanalysis.txt --hotspotfile $outdir/$gene/hotspots.txt --output $outdir/$gene/hotspot_analysis.txt
+	echo "done <br />" >> $output
+	echo "Done with $gene <br />" >> $output
+	cut $outdir/$gene/mutations.txt -d, -f2,3,4 > $mutationtmp
+	cut $outdir/$gene/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp
+	cat $mutationtmp $hotspottmp > $tmp
+	paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2
+	cat $tmp2 > $outdir/mutationandhotspot.txt
+done
+
+Rscript --verbose $dir/mutation_analysis.r $PWD/mutationstats.txt $PWD/summary.txt $outdir/ 2>&1
+python $dir/mutation_analysis.py --mutationfile $PWD/mutationanalysis.txt --hotspotfile $PWD/hotspots.txt --output $outdir/hotspot_analysis.txt
+
+cut $outdir/mutations.txt -d, -f2,3,4 > $mutationtmp
+cut $outdir/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp
+cat $mutationtmp $hotspottmp > $tmp
+paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2
+cat $tmp2 > $outdir/mutationandhotspot.txt
+
+cut $outdir/ca1/mutations.txt -d, -f1 > $mutationtmp
+cut $outdir/ca1/hotspot_analysis.txt -d, -f1 > $hotspottmp
+cat $mutationtmp $hotspottmp > $tmp
+paste $tmp $outdir/mutationandhotspot.txt -d, > $tmp2
+cat $tmp2 | tr -s "," > $outdir/mutationandhotspot.txt
+
+ca_n=`cat $outdir/ca/n.txt`
+ca1_n=`cat $outdir/ca1/n.txt`
+ca2_n=`cat $outdir/ca2/n.txt`
+cg_n=`cat $outdir/cg/n.txt`
+cg1_n=`cat $outdir/cg1/n.txt`
+cg2_n=`cat $outdir/cg2/n.txt`
+cg3_n=`cat $outdir/cg3/n.txt`
+cg4_n=`cat $outdir/cg4/n.txt`
+cm_n=`cat $outdir/cm/n.txt`
+#all_n=$((ca_n + cg_n + cm_n))
+all_n=`cat $outdir/n.txt`
+
+
+echo "<html><table border='1'>" > $output
+echo "<tr><th>info</th>" >> $output
+echo "<th><a href='identification/ca.txt'>ca (N = $ca_n)</a></th>" >> $output
+echo "<th><a href='identification/ca1.txt'>ca1 (N = $ca1_n)</a></th>" >> $output
+echo "<th><a href='identification/ca2.txt'>ca2 (N = $ca2_n)</a></th>" >> $output
+echo "<th><a href='identification/cg.txt'>cg (N = $cg_n)</a></th>" >> $output
+echo "<th><a href='identification/cg1.txt'>cg1 (N = $cg1_n)</a></th>" >> $output
+echo "<th><a href='identification/cg2.txt'>cg2 (N = $cg2_n)</a></th>" >> $output
+echo "<th><a href='identification/cg3.txt'>cg3 (N = $cg3_n)</a></th>" >> $output
+echo "<th><a href='identification/cg4.txt'>cg4 (N = $cg4_n)</a></th>" >> $output
+echo "<th><a href='identification/cm.txt'>cm (N = $cm_n)</a></th>" >> $output
+echo "<th>all (N = $all_n)</th></tr>" >> $output
+while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz
+	do
+		echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
+done < $outdir/mutationandhotspot.txt
+echo "</table>" >> $output
+echo "<a href='identification/unmatched.txt'>umatched</a><br />" >> $output
+
+Rscript $dir/piechart.r "${ca_n},${cg_n},${cm_n}" "IgA - ${ca_n},IgG - ${cg_n},IgM? - ${cm_n}" "Ig* (N = $all_n)" $outdir/all.png 2>&1
+Rscript $dir/piechart.r "${ca1_n},${ca2_n}" "IgA1 - ${ca1_n},IgA2 - ${ca2_n}" "IgA (N = $ca_n)" $outdir/ca.png 2>&1
+Rscript $dir/piechart.r "${cg1_n},${cg2_n},${cg3_n},${cg4_n}" "IgG1 - ${cg1_n},IgG2 - ${cg2_n},IgG3 - ${cg3_n},IgG4 - ${cg4_n}" "IgG (N = $cg_n)" $outdir/cg.png 2>&1
+
+echo "<img src='all.png'/>" >> $output
+echo "<img src='ca.png'/>" >> $output
+echo "<img src='cg.png'/>" >> $output
+
+for gene in ${genes[@]}
+do
+	echo "<table border='1'><caption>$gene transition table</caption>" >> $output
+	while IFS=, read from a c g t
+		do
+			echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
+	done < $outdir/$gene/transitions.txt
+	echo "</table>" >> $output
+done
+
+echo "<table border='1'><caption>All transition table</caption>" >> $output
+while IFS=, read from a c g t
+	do
+		echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
+done < $outdir/transitions.txt
+echo "</table>" >> $output
+
+echo "</html>" >> $output