Mercurial > repos > davidvanzessen > mutation_analysis
changeset 0:74d2bc479bee draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 18 Aug 2014 04:04:37 -0400 |
parents | |
children | 856b5b718d21 |
files | filter.r gene_identification.py mutation_analysis.py mutation_analysis.r mutation_analysis.xml piechart.r wrapper.sh |
diffstat | 7 files changed, 846 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.r Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,13 @@ +args <- commandArgs(trailingOnly = TRUE) + +input = args[1] +filterf = args[2] +output = args[3] + + +dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) +filterdat = read.table(filterf, header=T, sep="\t", fill=T, stringsAsFactors=F) + +dat = dat[dat$Sequence.ID %in% filterdat$ID,] + +write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_identification.py Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,265 @@ +import re +import argparse + + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") +parser.add_argument("--outdir", help="Output directory, 7 output files will be written here") + +args = parser.parse_args() + +infile = args.input +#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt" +outdir = args.outdir +#outfile = "identified.txt" + +dic = dict() +total = 0 + +first = True +with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence + for line in f: + total += 1 + if first: + first = False + continue + linesplt = line.split("\t") + if linesplt[2] == "No results": + continue + ID = linesplt[1] + seq = linesplt[28] + dic[ID] = seq + +#lambda/kappa reference sequence +searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", + "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", + "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} + +compiledregex = {"ca": [], + "cg": [], + "cm": []} + +#lambda/kappa reference sequence variable nucleotides +ca1 = {38: 't', 39: 'g', 48: 'a', 49: 'g', 51: 'c', 68: 'a', 73: 'c'} +ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} +cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} +cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'} +cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} +cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} + +#reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap +chunklength = 8 + +#create the chunks of the reference sequence with regular expressions for the variable nucleotides +for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2): + pos = i + chunk = searchstrings["ca"][i:i+chunklength] + result = "" + varsInResult = 0 + for c in chunk: + if pos in ca1.keys(): + varsInResult += 1 + result += "[" + ca1[pos] + ca2[pos] + "]" + else: + result += c + pos += 1 + compiledregex["ca"].append((re.compile(result), varsInResult)) + +for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2): + pos = i + chunk = searchstrings["cg"][i:i+chunklength] + result = "" + varsInResult = 0 + for c in chunk: + if pos in cg1.keys(): + varsInResult += 1 + result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]" + else: + result += c + pos += 1 + compiledregex["cg"].append((re.compile(result), varsInResult)) + +for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2): + compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False)) + + + +def removeAndReturnMaxIndex(x): #simplifies a list comprehension + m = max(x) + index = x.index(m) + x[index] = 0 + return index + + +start_location = dict() +hits = dict() +alltotal = 0 +for key in compiledregex.keys(): #for ca/cg/cm + regularexpressions = compiledregex[key] #get the compiled regular expressions + for ID in dic.keys()[0:]: #for every ID + if ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists + hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0} + currentIDHits = hits[ID] + seq = dic[ID] + lastindex = 0 + start = [0] * len(seq) + for i, regexp in enumerate(regularexpressions): #for every regular expression + regex, hasVar = regexp + matches = regex.finditer(seq[lastindex:]) + for match in matches: #for every match with the current regex, only uses the first hit + lastindex += match.start() + start[lastindex - chunklength / 2 * i] += 1 + if hasVar: #if the regex has a variable nt in it + chunkstart = chunklength / 2 * i #where in the reference does this chunk start + chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end + if key == "ca": #just calculate the variable nt score for 'ca', cheaper + currentIDHits["ca1"] += len([1 for x in ca1 if chunkstart <= x < chunkend and ca1[x] == seq[lastindex + x - chunkstart]]) + currentIDHits["ca2"] += len([1 for x in ca2 if chunkstart <= x < chunkend and ca2[x] == seq[lastindex + x - chunkstart]]) + elif key == "cg": #just calculate the variable nt score for 'cg', cheaper + currentIDHits["cg1"] += len([1 for x in cg1 if chunkstart <= x < chunkend and cg1[x] == seq[lastindex + x - chunkstart]]) + currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]]) + currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]]) + currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]]) + else: #key == "cm" #no variable regions in 'cm' + pass + break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped + else: #only runs if there were no hits + continue + #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] + currentIDHits[key + "_hits"] += 1 + start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1]) + #start_location[ID + "_" + key] = str(start.index(max(start))) + + +chunksInCA = len(compiledregex["ca"]) +chunksInCG = len(compiledregex["cg"]) +chunksInCM = len(compiledregex["cm"]) +requiredChunkPercentage = 0.7 +varsInCA = float(len(ca1.keys()) * 2) +varsInCG = float(len(cg1.keys()) * 2) + 1 +varsInCM = 0 +requiredVarPercentage = 0.7 + +ca = 0 +ca1 = 0 +ca2 = 0 +cg = 0 +cg1 = 0 +cg2 = 0 +cg3 = 0 +cg4 = 0 +cm = 0 +try: + cafile = open(outdir + "/ca.txt", 'w') + ca1file = open(outdir + "/ca1.txt", 'w') + ca2file = open(outdir + "/ca2.txt", 'w') + cgfile = open(outdir + "/cg.txt", 'w') + cg1file = open(outdir + "/cg1.txt", 'w') + cg2file = open(outdir + "/cg2.txt", 'w') + cg3file = open(outdir + "/cg3.txt", 'w') + cg4file = open(outdir + "/cg4.txt", 'w') + cmfile = open(outdir + "/cm.txt", 'w') + unmatchedfile = open(outdir + "/unmatched.txt", 'w') + cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n") + for ID in hits.keys(): + currentIDHits = hits[ID] + possibleca = float(len(compiledregex["ca"])) + possiblecg = float(len(compiledregex["cg"])) + possiblecm = float(len(compiledregex["cm"])) + cahits = currentIDHits["ca_hits"] + cghits = currentIDHits["cg_hits"] + cmhits = currentIDHits["cm_hits"] + if cahits > cghits and cahits > cmhits: #its a ca gene + if cahits <= int(chunksInCA * requiredChunkPercentage): + unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n") + continue + ca += 1 + ca1hits = currentIDHits["ca1"] + ca2hits = currentIDHits["ca2"] + cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + if ca1hits > ca2hits: + #print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit" + if ca1hits <= int(varsInCA * requiredVarPercentage): + unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") + continue + ca1 += 1 + ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + else: + #print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit" + if ca2hits <= int(varsInCA * requiredVarPercentage): + unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") + continue + ca2 += 1 + ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + elif cghits > cahits and cghits > cmhits: #its a cg gene + if cghits <= int(chunksInCG * requiredChunkPercentage): + unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n") + continue + cg += 1 + cg1hits = currentIDHits["cg1"] + cg2hits = currentIDHits["cg2"] + cg3hits = currentIDHits["cg3"] + cg4hits = currentIDHits["cg4"] + cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene + if cg1hits <= int(varsInCG * requiredVarPercentage): + unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n") + continue + cg1 += 1 + cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene + if cg2hits <= int(varsInCG * requiredVarPercentage): + unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n") + continue + cg2 += 1 + cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene + if cg3hits <= int(varsInCG * requiredVarPercentage): + unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n") + continue + cg3 += 1 + cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + else: #cg4 gene + if cg4hits <= int(varsInCG * requiredVarPercentage): + unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n") + continue + cg4 += 1 + cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + else: #its a cm gene + if cmhits <= int(chunksInCM * requiredChunkPercentage): + unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n") + continue + cm += 1 + cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") +finally: + cafile.close() + ca1file.close() + ca2file.close() + cgfile.close() + cg1file.close() + cg2file.close() + cg3file.close() + cg4file.close() + cmfile.close() + unmatchedfile.close() + + +#print ca,cg,cm,(ca+cg+cm) + + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.py Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,80 @@ +import re +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--mutationfile", help="The '7_V-REGION-mutation-and-AA-change-table' file from the IMGT output") +parser.add_argument("--hotspotfile", help="The '10_V-REGION-mutation-hotspots' file from the IMGT output") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +mutationfile = args.mutationfile #"test_VH-Ca_Cg_25nt/7_V-REGION-mutation-and-AA-change-table_test_VH-Ca_Cg_25nt_241013.txt" +hotspotsfile = args.hotspotfile #"test_VH-Ca_Cg_25nt/10_V-REGION-mutation-hotspots_test_VH-Ca_Cg_25nt_241013.txt" +outfile = args.output #"out.txt" + +mutationdic = dict() +mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") +linecount = 0 + +with open(mutationfile, 'r') as i: + for line in i.readlines()[1:]: + linecount += 1 + linesplt = line.split("\t") + if linesplt[2] != "productive": + continue + ID = linesplt[1] + mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[5].split("|") if x] + mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[6].split("|") if x] + mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[7].split("|") if x] + mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[8].split("|") if x] + mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[9].split("|") if x] + +if linecount == 0: + print "No data, exiting" + with open(outfile, 'w') as o: + o.write("RGYW (%),0,0,NA\n") + o.write("WRCY (%),0,0,NA\n") + o.write("WA (%),0,0,NA\n") + o.write("TW (%),0,0,NA\n") + import sys + sys.exit() + +hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") +RGYWCount = 0 +WRCYCount = 0 +WACount = 0 +TWCount = 0 + +with open(hotspotsfile, 'r') as i: + for line in i.readlines()[1:]: + linesplt = line.split("\t") + if linesplt[2] != "productive": + continue + ID = linesplt[1] + RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[6].split("|") if x]] + WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[7].split("|") if x]] + WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[4].split("|") if x]] + TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[5].split("|") if x]] + RGYWCount += sum([1 for (x,y,z) in RGYW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + WRCYCount += sum([1 for (x,y,z) in WRCY if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + WACount += sum([1 for (x,y,z) in WA if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + TWCount += sum([1 for (x,y,z) in TW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + +path = outfile[:outfile.rfind("/") + 1] + "mutations.txt" +value = 0; +with open(path, 'r') as f: + value = f.readlines()[0].split(",")[1] +with open(outfile, 'w') as o: + if value != "0": + o.write("RGYW (%)," + str(RGYWCount) + "," + value + "," + str(round(RGYWCount / float(value) * 100, 1)) + "\n") + o.write("WRCY (%)," + str(WRCYCount) + "," + value + "," + str(round(WRCYCount / float(value) * 100, 1)) + "\n") + o.write("WA (%)," + str(WACount) + "," + value + "," + str(round(WACount / float(value) * 100, 1)) + "\n") + o.write("TW (%)," + str(TWCount) + "," + value + "," + str(round(TWCount / float(value) * 100, 1)) + "\n") + else: + o.write("RGYW (%)," + str(RGYWCount) + ",0,NA\n") + o.write("WRCY (%)," + str(WRCYCount) + ",0,NA\n") + o.write("WA (%)," + str(WACount) + ",0,NA\n") + o.write("TW (%)," + str(TWCount) + ",0,NA\n") + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.r Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,328 @@ +args <- commandArgs(trailingOnly = TRUE) + +input = args[1] +summaryinput = args[2] +outputdir = args[3] +setwd(outputdir) + +#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F) +dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) + +datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F) +datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")] + +dat = merge(dat, datSum, by="Sequence.ID", all.x=T) + +#dat = dat[dat$Functionality == "productive",] + +dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele) +dat$VGene = gsub("[*].*", "", dat$VGene) + +dat$past = paste(dat$AA.JUNCTION, dat$VGene) + +#dat = dat[!duplicated(dat$past), ] + +if(length(dat$Sequence.ID) == 0){ + setwd(outputdir) + result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) + row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) + cat("0", file="n.txt") + stop("No data") +} + +#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left")) + +result = data.frame(x = 1:5, y = 1:5, z = 1:5) +row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + +cleanup_columns = c("FR1.IMGT.c.a", + "FR2.IMGT.g.t", + "CDR1.IMGT.Nb.of.nucleotides", + "CDR2.IMGT.t.a", + "FR1.IMGT.c.g", + "CDR1.IMGT.c.t", + "FR2.IMGT.a.c", + "FR2.IMGT.Nb.of.mutations", + "FR2.IMGT.g.c", + "FR2.IMGT.a.g", + "FR3.IMGT.t.a", + "FR3.IMGT.t.c", + "FR2.IMGT.g.a", + "FR3.IMGT.c.g", + "FR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.g.a", + "CDR1.IMGT.t.g", + "CDR1.IMGT.g.c", + "CDR2.IMGT.Nb.of.nucleotides", + "FR2.IMGT.a.t", + "CDR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.a.g", + "FR3.IMGT.a.c", + "FR1.IMGT.g.a", + "FR3.IMGT.a.g", + "FR1.IMGT.a.t", + "CDR2.IMGT.a.g", + "CDR2.IMGT.Nb.of.mutations", + "CDR2.IMGT.g.t", + "CDR2.IMGT.a.c", + "CDR1.IMGT.t.c", + "FR3.IMGT.g.c", + "FR1.IMGT.g.t", + "FR3.IMGT.g.t", + "CDR1.IMGT.a.t", + "FR1.IMGT.a.g", + "FR3.IMGT.a.t", + "FR3.IMGT.Nb.of.nucleotides", + "FR2.IMGT.t.c", + "CDR2.IMGT.g.a", + "FR2.IMGT.t.a", + "CDR1.IMGT.t.a", + "FR2.IMGT.t.g", + "FR3.IMGT.t.g", + "FR2.IMGT.Nb.of.nucleotides", + "FR1.IMGT.t.a", + "FR1.IMGT.t.g", + "FR3.IMGT.c.t", + "FR1.IMGT.t.c", + "CDR2.IMGT.a.t", + "FR2.IMGT.c.t", + "CDR1.IMGT.g.t", + "CDR2.IMGT.t.g", + "FR1.IMGT.Nb.of.nucleotides", + "CDR1.IMGT.c.g", + "CDR2.IMGT.t.c", + "FR3.IMGT.g.a", + "CDR1.IMGT.a.c", + "FR2.IMGT.c.a", + "FR3.IMGT.Nb.of.mutations", + "FR2.IMGT.c.g", + "CDR2.IMGT.g.c", + "FR1.IMGT.g.c", + "CDR2.IMGT.c.t", + "FR3.IMGT.c.a", + "CDR1.IMGT.c.a", + "CDR2.IMGT.c.g", + "CDR2.IMGT.c.a", + "FR1.IMGT.c.t") + +for(col in cleanup_columns){ + dat[,col] = gsub("\\(.*\\)", "", dat[,col]) + #dat[dat[,col] == "",] = "0" + dat[,col] = as.numeric(dat[,col]) + dat[is.na(dat[,col]),] = 0 +} + +VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations + + dat$CDR1.IMGT.Nb.of.mutations + + dat$FR2.IMGT.Nb.of.mutations + + dat$CDR2.IMGT.Nb.of.mutations + + dat$FR3.IMGT.Nb.of.mutations) + +VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides + + dat$CDR1.IMGT.Nb.of.nucleotides + + dat$FR2.IMGT.Nb.of.nucleotides + + dat$CDR2.IMGT.Nb.of.nucleotides + + dat$FR3.IMGT.Nb.of.nucleotides) + +result[1,"x"] = VRegionMutations +result[1,"y"] = VRegionNucleotides +result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1) + +transitionMutations = sum(dat$FR1.IMGT.a.g + + dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$FR1.IMGT.t.c + + dat$CDR1.IMGT.a.g + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$CDR1.IMGT.t.c + + dat$FR2.IMGT.a.g + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$FR2.IMGT.t.c + + dat$CDR2.IMGT.a.g + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$CDR2.IMGT.t.c + + dat$FR3.IMGT.a.g + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t + + dat$FR3.IMGT.t.c) + +result[2,"x"] = transitionMutations +result[2,"y"] = VRegionMutations +result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1) + +transversionMutations = sum( dat$FR1.IMGT.a.c + + dat$FR1.IMGT.c.a + + dat$FR1.IMGT.a.t + + dat$FR1.IMGT.t.a + + dat$FR1.IMGT.g.c + + dat$FR1.IMGT.c.g + + dat$FR1.IMGT.g.t + + dat$FR1.IMGT.t.g + + dat$CDR1.IMGT.a.c + + dat$CDR1.IMGT.c.a + + dat$CDR1.IMGT.a.t + + dat$CDR1.IMGT.t.a + + dat$CDR1.IMGT.g.c + + dat$CDR1.IMGT.c.g + + dat$CDR1.IMGT.g.t + + dat$CDR1.IMGT.t.g + + dat$FR2.IMGT.a.c + + dat$FR2.IMGT.c.a + + dat$FR2.IMGT.a.t + + dat$FR2.IMGT.t.a + + dat$FR2.IMGT.g.c + + dat$FR2.IMGT.c.g + + dat$FR2.IMGT.g.t + + dat$FR2.IMGT.t.g + + dat$CDR2.IMGT.a.c + + dat$CDR2.IMGT.c.a + + dat$CDR2.IMGT.a.t + + dat$CDR2.IMGT.t.a + + dat$CDR2.IMGT.g.c + + dat$CDR2.IMGT.c.g + + dat$CDR2.IMGT.g.t + + dat$CDR2.IMGT.t.g + + dat$FR3.IMGT.a.c + + dat$FR3.IMGT.c.a + + dat$FR3.IMGT.a.t + + dat$FR3.IMGT.t.a + + dat$FR3.IMGT.g.c + + dat$FR3.IMGT.c.g + + dat$FR3.IMGT.g.t + + dat$FR3.IMGT.t.g) + +result[3,"x"] = transversionMutations +result[3,"y"] = VRegionMutations +result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1) + +transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t) + +totalMutationsAtGC = sum(dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$FR1.IMGT.c.a + + dat$FR1.IMGT.g.c + + dat$FR1.IMGT.c.g + + dat$FR1.IMGT.g.t + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$CDR1.IMGT.c.a + + dat$CDR1.IMGT.g.c + + dat$CDR1.IMGT.c.g + + dat$CDR1.IMGT.g.t + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$FR2.IMGT.c.a + + dat$FR2.IMGT.g.c + + dat$FR2.IMGT.c.g + + dat$FR2.IMGT.g.t + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$CDR2.IMGT.c.a + + dat$CDR2.IMGT.g.c + + dat$CDR2.IMGT.c.g + + dat$CDR2.IMGT.g.t + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t + + dat$FR3.IMGT.c.a + + dat$FR3.IMGT.g.c + + dat$FR3.IMGT.c.g + + dat$FR3.IMGT.g.t) + +result[4,"x"] = transitionMutationsAtGC +result[4,"y"] = totalMutationsAtGC +result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1) + +result[5,"x"] = totalMutationsAtGC +result[5,"y"] = VRegionMutations +result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1) + + +#transitiontable + +transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) +row.names(transitionTable) = c("A", "C", "G", "T") +transitionTable["A","A"] = NA +transitionTable["C","C"] = NA +transitionTable["G","G"] = NA +transitionTable["T","T"] = NA +nts = c("a", "c", "g", "t") + + +for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + transitionTable[NT1,NT2] = sum( dat[,FR1] + + dat[,CDR1] + + dat[,FR2] + + dat[,CDR2] + + dat[,FR3]) + } +} + +setwd(outputdir) +write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) +write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) +cat(length(dat$Sequence.ID), file="n.txt") + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.xml Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,15 @@ +<tool id="mutation_analysis_shm" name="Mutation Analysis" version="1.0"> + <description></description> + <command interpreter="bash"> + wrapper.sh $in_file $out_file $out_file.files_path + </command> + <inputs> + <param name="in_file" type="data" label="IMGT zip file to be analysed" /> + </inputs> + <outputs> + <data format="html" name="out_file" label = "Mutation analysis on ${in_file.name}"/> + </outputs> + <help> + there is no help + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/piechart.r Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,24 @@ +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +} +library(ggplot2) + +args <- commandArgs(trailingOnly = TRUE) + +vls = args[1] +lbls = args[2] +name = args[3] +output = args[4] + +vls = as.numeric(unlist(strsplit(vls, ","))) +lbls = unlist(strsplit(lbls, ",")) + +pc = ggplot(data.frame(Type=lbls, vls=vls), aes(x = factor(1), y=vls, fill=Type)) +pc = pc + geom_bar(width = 1, stat = "identity") +pc = pc + coord_polar(theta="y") +pc = pc + xlab(" ") + ylab(" ") + ggtitle(name) + +png(filename=output) +#pie(vls, labels=lbls, col=rainbow(length(vls)), main=name) +pc +dev.off()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wrapper.sh Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,121 @@ +#!/bin/bash +dir="$(cd "$(dirname "$0")" && pwd)" + +input=$1 +output=$2 +outdir=$3 +mkdir $outdir + +unzip $input -d $PWD/files/ > $PWD/unziplog.log +cat $PWD/files/*/1_* > $PWD/summary.txt +cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt +cat $PWD/files/*/8_* > $PWD/mutationstats.txt +cat $PWD/files/*/10_* > $PWD/hotspots.txt + +mkdir $outdir/identification/ +python $dir/gene_identification.py --input $PWD/summary.txt --outdir $outdir/identification/ +genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm) +tmp=$PWD/tmp +tmp2=$PWD/tmp2 +hotspottmp=$PWD/hotspottmp +mutationtmp=$PWD/mutationtmp +touch $outdir/mutationandhotspot.txt +for gene in ${genes[@]} +do + echo "Running $gene <br />" >> $output + mkdir $outdir/$gene + echo "Filtering input..." >> $output + Rscript $dir/filter.r $PWD/summary.txt $outdir/identification/${gene}.txt $outdir/$gene/summary.txt + Rscript $dir/filter.r $PWD/mutationanalysis.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationanalysis.txt + Rscript $dir/filter.r $PWD/mutationstats.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationstats.txt + Rscript $dir/filter.r $PWD/hotspots.txt $outdir/identification/${gene}.txt $outdir/$gene/hotspots.txt + echo "done <br />" >> $output + + echo "Running R script on $gene..." >> $output + Rscript --verbose $dir/mutation_analysis.r $outdir/$gene/mutationstats.txt $outdir/$gene/summary.txt $outdir/$gene/ 2>&1 + echo "done <br />" >> $output + + echo "Running Python script..." >> $output + python $dir/mutation_analysis.py --mutationfile $outdir/$gene/mutationanalysis.txt --hotspotfile $outdir/$gene/hotspots.txt --output $outdir/$gene/hotspot_analysis.txt + echo "done <br />" >> $output + echo "Done with $gene <br />" >> $output + cut $outdir/$gene/mutations.txt -d, -f2,3,4 > $mutationtmp + cut $outdir/$gene/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp + cat $mutationtmp $hotspottmp > $tmp + paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2 + cat $tmp2 > $outdir/mutationandhotspot.txt +done + +Rscript --verbose $dir/mutation_analysis.r $PWD/mutationstats.txt $PWD/summary.txt $outdir/ 2>&1 +python $dir/mutation_analysis.py --mutationfile $PWD/mutationanalysis.txt --hotspotfile $PWD/hotspots.txt --output $outdir/hotspot_analysis.txt + +cut $outdir/mutations.txt -d, -f2,3,4 > $mutationtmp +cut $outdir/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp +cat $mutationtmp $hotspottmp > $tmp +paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2 +cat $tmp2 > $outdir/mutationandhotspot.txt + +cut $outdir/ca1/mutations.txt -d, -f1 > $mutationtmp +cut $outdir/ca1/hotspot_analysis.txt -d, -f1 > $hotspottmp +cat $mutationtmp $hotspottmp > $tmp +paste $tmp $outdir/mutationandhotspot.txt -d, > $tmp2 +cat $tmp2 | tr -s "," > $outdir/mutationandhotspot.txt + +ca_n=`cat $outdir/ca/n.txt` +ca1_n=`cat $outdir/ca1/n.txt` +ca2_n=`cat $outdir/ca2/n.txt` +cg_n=`cat $outdir/cg/n.txt` +cg1_n=`cat $outdir/cg1/n.txt` +cg2_n=`cat $outdir/cg2/n.txt` +cg3_n=`cat $outdir/cg3/n.txt` +cg4_n=`cat $outdir/cg4/n.txt` +cm_n=`cat $outdir/cm/n.txt` +#all_n=$((ca_n + cg_n + cm_n)) +all_n=`cat $outdir/n.txt` + + +echo "<html><table border='1'>" > $output +echo "<tr><th>info</th>" >> $output +echo "<th><a href='identification/ca.txt'>ca (N = $ca_n)</a></th>" >> $output +echo "<th><a href='identification/ca1.txt'>ca1 (N = $ca1_n)</a></th>" >> $output +echo "<th><a href='identification/ca2.txt'>ca2 (N = $ca2_n)</a></th>" >> $output +echo "<th><a href='identification/cg.txt'>cg (N = $cg_n)</a></th>" >> $output +echo "<th><a href='identification/cg1.txt'>cg1 (N = $cg1_n)</a></th>" >> $output +echo "<th><a href='identification/cg2.txt'>cg2 (N = $cg2_n)</a></th>" >> $output +echo "<th><a href='identification/cg3.txt'>cg3 (N = $cg3_n)</a></th>" >> $output +echo "<th><a href='identification/cg4.txt'>cg4 (N = $cg4_n)</a></th>" >> $output +echo "<th><a href='identification/cm.txt'>cm (N = $cm_n)</a></th>" >> $output +echo "<th>all (N = $all_n)</th></tr>" >> $output +while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz + do + echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output +done < $outdir/mutationandhotspot.txt +echo "</table>" >> $output +echo "<a href='identification/unmatched.txt'>umatched</a><br />" >> $output + +Rscript $dir/piechart.r "${ca_n},${cg_n},${cm_n}" "IgA - ${ca_n},IgG - ${cg_n},IgM? - ${cm_n}" "Ig* (N = $all_n)" $outdir/all.png 2>&1 +Rscript $dir/piechart.r "${ca1_n},${ca2_n}" "IgA1 - ${ca1_n},IgA2 - ${ca2_n}" "IgA (N = $ca_n)" $outdir/ca.png 2>&1 +Rscript $dir/piechart.r "${cg1_n},${cg2_n},${cg3_n},${cg4_n}" "IgG1 - ${cg1_n},IgG2 - ${cg2_n},IgG3 - ${cg3_n},IgG4 - ${cg4_n}" "IgG (N = $cg_n)" $outdir/cg.png 2>&1 + +echo "<img src='all.png'/>" >> $output +echo "<img src='ca.png'/>" >> $output +echo "<img src='cg.png'/>" >> $output + +for gene in ${genes[@]} +do + echo "<table border='1'><caption>$gene transition table</caption>" >> $output + while IFS=, read from a c g t + do + echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output + done < $outdir/$gene/transitions.txt + echo "</table>" >> $output +done + +echo "<table border='1'><caption>All transition table</caption>" >> $output +while IFS=, read from a c g t + do + echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output +done < $outdir/transitions.txt +echo "</table>" >> $output + +echo "</html>" >> $output