# HG changeset patch # User davidvanzessen # Date 1456760979 18000 # Node ID 7290a88ea202b13eb87e7865671de8f6fa3cec54 # Parent d3542f87a30444a86a0afe036009627d9999d0b8 Uploaded diff -r d3542f87a304 -r 7290a88ea202 mutation_analysis.py --- a/mutation_analysis.py Fri Jan 29 08:11:31 2016 -0500 +++ b/mutation_analysis.py Mon Feb 29 10:49:39 2016 -0500 @@ -1,3 +1,4 @@ +from __future__ import division from collections import defaultdict import re import argparse @@ -56,8 +57,7 @@ linesplt = line.split("\t") ID = linesplt[IDIndex] genedic[ID] = linesplt[best_matchIndex] - mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if - x] if include_fr1 else [] + mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] @@ -209,35 +209,68 @@ WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs + +def mean(lst): + return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0 + + +def median(lst): + lst = sorted(lst) + l = len(lst) + if l == 0: + return 0 + if l == 1: + return lst[0] + + l = int(l / 2) + + if len(lst) % 2 == 0: + print "list length is", l + return float(lst[l] + lst[(l - 1)]) / 2.0 + else: + return lst[l] + +funcs = {"mean": mean, "median": median, "sum": sum} + directory = outfile[:outfile.rfind("/") + 1] value = 0 valuedic = dict() -for gene in genes: - with open(directory + gene + "_value.txt", 'r') as v: - valuedic[gene] = int(v.readlines()[0].rstrip()) -with open(directory + "total_value.txt", 'r') as v: - valuedic["total"] = int(v.readlines()[0].rstrip()) + +for fname in funcs.keys(): + for gene in genes: + with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: + valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) + with open(directory + "all_" + fname + "_value.txt", 'r') as v: + valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) + +print valuedic + +def get_xyz(lst, gene, f, fname): + x = int(round(f(lst))) + y = valuedic[gene + "_" + fname] + z = str(round(x / float(valuedic[gene + "_" + fname]) * 100, 1)) if valuedic[gene + "_" + fname] != 0 else "0" + return (str(x), str(y), z) dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} arr = ["RGYW", "WRCY", "WA", "TW"] -with open(outfile, 'w') as o: - for typ in arr: - o.write(typ + " (%)") - curr = dic[typ] - for gene in genes: - geneMatcher = re.compile(".*" + gene + ".*") - if valuedic[gene] is 0: - o.write(",0,0,0") - else: - x = int(round(sum([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]]))) - y = valuedic[gene] - z = str(round(x / float(valuedic[gene]) * 100, 1)) - o.write("," + str(x) + "," + str(y) + "," + z) - # for total - x = int(round(sum([y for x, y in curr.iteritems()]))) - y = valuedic["total"] - z = str(round(x / float(valuedic["total"]) * 100, 1)) - o.write("," + str(x) + "," + str(y) + "," + z + "\n") + +for fname in funcs.keys(): + func = funcs[fname] + foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" + with open(foutfile, 'w') as o: + for typ in arr: + o.write(typ + " (%)") + curr = dic[typ] + for gene in genes: + geneMatcher = re.compile(".*" + gene + ".*") + if valuedic[gene + "_" + fname] is 0: + o.write(",0,0,0") + else: + x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) + o.write("," + x + "," + y + "," + z) + # for total + x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) + o.write("," + x + "," + y + "," + z + "\n") # for testing @@ -245,5 +278,4 @@ with open(seq_motif_file, 'w') as o: o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") for ID in IDlist: - o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str( - round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") + o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") diff -r d3542f87a304 -r 7290a88ea202 mutation_analysis.py.bak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.py.bak Mon Feb 29 10:49:39 2016 -0500 @@ -0,0 +1,248 @@ +from collections import defaultdict +import re +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--input", + help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") +parser.add_argument("--genes", help="The genes available in the 'best_match' column") +parser.add_argument("--includefr1", help="The genes available in the 'best_match' column") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +infile = args.input +genes = str(args.genes).split(",") +print "includefr1 =", args.includefr1 +include_fr1 = True if args.includefr1 == "yes" else False +outfile = args.output + +genedic = dict() + +mutationdic = dict() +mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") +linecount = 0 + +IDIndex = 0 +best_matchIndex = 0 +fr1Index = 0 +cdr1Index = 0 +fr2Index = 0 +cdr2Index = 0 +fr3Index = 0 +first = True +IDlist = [] +mutationList = [] +mutationListByID = {} +cdr1LengthDic = {} +cdr2LengthDic = {} + +with open(infile, 'r') as i: + for line in i: + if first: + linesplt = line.split("\t") + IDIndex = linesplt.index("Sequence.ID") + best_matchIndex = linesplt.index("best_match") + fr1Index = linesplt.index("FR1.IMGT") + cdr1Index = linesplt.index("CDR1.IMGT") + fr2Index = linesplt.index("FR2.IMGT") + cdr2Index = linesplt.index("CDR2.IMGT") + fr3Index = linesplt.index("FR3.IMGT") + cdr1LengthIndex = linesplt.index("CDR1.IMGT.Nb.of.nucleotides") + cdr2LengthIndex = linesplt.index("CDR2.IMGT.Nb.of.nucleotides") + first = False + continue + linecount += 1 + linesplt = line.split("\t") + ID = linesplt[IDIndex] + genedic[ID] = linesplt[best_matchIndex] + mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] + mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] + mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] + mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] + mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] + + mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + + cdr1Length = linesplt[cdr1LengthIndex] + cdr2Length = linesplt[cdr2LengthIndex] + + cdr1LengthDic[ID] = int(cdr1Length) / 3 + cdr2LengthDic[ID] = int(cdr2Length) / 3 + + IDlist += [ID] + +AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent + +AA_mutation = [0] * AALength +AA_mutation_empty = AA_mutation[:] + +aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" +with open(aa_mutations_by_id_file, 'w') as o: + for ID in mutationListByID.keys(): + AA_mutation_for_ID = AA_mutation_empty[:] + for mutation in mutationListByID[ID]: + if mutation[4]: + AA_mutation[int(mutation[4])] += 1 + AA_mutation_for_ID[int(mutation[4])] += 1 + o.write(ID + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") + + + +#absent AA stuff +absentAACDR1Dic = defaultdict(list) +absentAACDR1Dic[5] = range(29,36) +absentAACDR1Dic[6] = range(29,35) +absentAACDR1Dic[7] = range(30,35) +absentAACDR1Dic[8] = range(30,34) +absentAACDR1Dic[9] = range(31,34) +absentAACDR1Dic[10] = range(31,33) +absentAACDR1Dic[11] = [32] + +absentAACDR2Dic = defaultdict(list) +absentAACDR2Dic[0] = range(55,65) +absentAACDR2Dic[1] = range(56,65) +absentAACDR2Dic[2] = range(56,64) +absentAACDR2Dic[3] = range(57,64) +absentAACDR2Dic[4] = range(57,63) +absentAACDR2Dic[5] = range(58,63) +absentAACDR2Dic[6] = range(58,62) +absentAACDR2Dic[7] = range(59,62) +absentAACDR2Dic[8] = range(59,61) +absentAACDR2Dic[9] = [60] + +absentAA = [len(IDlist)] * (AALength-1) +for k, cdr1Length in cdr1LengthDic.iteritems(): + for c in absentAACDR1Dic[cdr1Length]: + absentAA[c] -= 1 + +for k, cdr2Length in cdr2LengthDic.iteritems(): + for c in absentAACDR2Dic[cdr2Length]: + absentAA[c] -= 1 + + +aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" +with open(aa_mutations_by_id_file, 'w') as o: + o.write("ID\tcdr1length\tcdr2length\t" + "\t".join([str(x) for x in range(1,AALength-1)]) + "\n") + for ID in IDlist: + absentAAbyID = [1] * (AALength-1) + cdr1Length = cdr1LengthDic[ID] + for c in absentAACDR1Dic[cdr1Length]: + absentAAbyID[c] -= 1 + + cdr2Length = cdr2LengthDic[ID] + for c in absentAACDR2Dic[cdr2Length]: + absentAAbyID[c] -= 1 + o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") + + + +aa_mutations_file = outfile[:outfile.rindex("/")] + "/aa_mutations.txt" +with open(aa_mutations_file, 'w') as o: + o.write("row.name\t" + "\t".join([str(x) for x in range(1, AALength-1)]) + "\n") + o.write("mutations.at.position\t" + "\t".join([str(x) for x in AA_mutation[1:]]) + "\n") + o.write("AA.at.position\t" + "\t".join([str(x) for x in absentAA]) + "\n") + +if linecount == 0: + print "No data, exiting" + with open(outfile, 'w') as o: + o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) + o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) + o.write("WA (%)," + ("0,0,0\n" * len(genes))) + o.write("TW (%)," + ("0,0,0\n" * len(genes))) + import sys + + sys.exit() + +hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") +RGYWCount = {g: 0 for g in genes} +WRCYCount = {g: 0 for g in genes} +WACount = {g: 0 for g in genes} +TWCount = {g: 0 for g in genes} + +IDIndex = 0 +ataIndex = 0 +tatIndex = 0 +aggctatIndex = 0 +atagcctIndex = 0 +first = True +with open(infile, 'r') as i: + for line in i: + if first: + linesplt = line.split("\t") + ataIndex = linesplt.index("X.a.t.a") + tatIndex = linesplt.index("t.a.t.") + aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.") + atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.") + first = False + continue + linesplt = line.split("\t") + gene = linesplt[best_matchIndex] + ID = linesplt[IDIndex] + RGYW = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] + WRCY = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] + WA = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] + TW = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] + RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 + + mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ + ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + for mutation in mutationList: + frm, where, to, AAfrm, AAwhere, AAto, junk = mutation + mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) + mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) + mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) + mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) + + in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) + + if in_how_many_motifs > 0: + RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs + WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs + WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs + TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs + +directory = outfile[:outfile.rfind("/") + 1] +value = 0 +valuedic = dict() +for gene in genes: + with open(directory + gene + "_value.txt", 'r') as v: + valuedic[gene] = int(v.readlines()[0].rstrip()) +with open(directory + "total_value.txt", 'r') as v: + valuedic["total"] = int(v.readlines()[0].rstrip()) + +dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} +arr = ["RGYW", "WRCY", "WA", "TW"] +with open(outfile, 'w') as o: + for typ in arr: + o.write(typ + " (%)") + curr = dic[typ] + for gene in genes: + geneMatcher = re.compile(".*" + gene + ".*") + if valuedic[gene] is 0: + o.write(",0,0,0") + else: + x = int(round(sum([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]]))) + y = valuedic[gene] + z = str(round(x / float(valuedic[gene]) * 100, 1)) + o.write("," + str(x) + "," + str(y) + "," + z) + # for total + x = int(round(sum([y for x, y in curr.iteritems()]))) + y = valuedic["total"] + z = str(round(x / float(valuedic["total"]) * 100, 1)) + o.write("," + str(x) + "," + str(y) + "," + z + "\n") + + +# for testing +seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt" +with open(seq_motif_file, 'w') as o: + o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") + for ID in IDlist: + o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str( + round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") diff -r d3542f87a304 -r 7290a88ea202 mutation_analysis.r --- a/mutation_analysis.r Fri Jan 29 08:11:31 2016 -0500 +++ b/mutation_analysis.r Mon Feb 29 10:49:39 2016 -0500 @@ -170,55 +170,53 @@ setwd(outputdir) -nts = c("a", "c", "g", "t") -zeros=rep(0, 4) -matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=9) -for(i in 1:length(genes)){ - gene = genes[i] + +calculate_result = function(i, gene, dat, matrx, f, fname, name){ tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] - if(gene == "."){ - tmp = dat - } + j = i - 1 x = (j * 3) + 1 y = (j * 3) + 2 z = (j * 3) + 3 - matrx[1,x] = sum(tmp$VRegionMutations) - matrx[1,y] = sum(tmp$VRegionNucleotides) - matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) - - matrx[2,x] = sum(tmp$transitionMutations) - matrx[2,y] = sum(tmp$VRegionMutations) - matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) - - matrx[3,x] = sum(tmp$transversionMutations) - matrx[3,y] = sum(tmp$VRegionMutations) - matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) - - matrx[4,x] = sum(tmp$transitionMutationsAtGC) - matrx[4,y] = sum(tmp$totalMutationsAtGC) - matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) - - matrx[5,x] = sum(tmp$totalMutationsAtGC) - matrx[5,y] = sum(tmp$VRegionMutations) - matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) - - matrx[6,x] = sum(tmp$transitionMutationsAtAT) - matrx[6,y] = sum(tmp$totalMutationsAtAT) - matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) - - matrx[7,x] = sum(tmp$totalMutationsAtAT) - matrx[7,y] = sum(tmp$VRegionMutations) - matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) - - matrx[8,x] = sum(tmp$nonSilentMutationsFR) - matrx[8,y] = sum(tmp$silentMutationsFR) - matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) - - matrx[9,x] = sum(tmp$nonSilentMutationsCDR) - matrx[9,y] = sum(tmp$silentMutationsCDR) - matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) - + + if(nrow(tmp) > 0){ + + matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) + matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) + + matrx[2,x] = round(f(tmp$transitionMutations, na.rm=T), digits=1) + matrx[2,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) + + matrx[3,x] = round(f(tmp$transversionMutations, na.rm=T), digits=1) + matrx[3,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) + + matrx[4,x] = round(f(tmp$transitionMutationsAtGC, na.rm=T), digits=1) + matrx[4,y] = round(f(tmp$totalMutationsAtGC, na.rm=T), digits=1) + matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) + + matrx[5,x] = round(f(tmp$totalMutationsAtGC, na.rm=T), digits=1) + matrx[5,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) + + matrx[6,x] = round(f(tmp$transitionMutationsAtAT, na.rm=T), digits=1) + matrx[6,y] = round(f(tmp$totalMutationsAtAT, na.rm=T), digits=1) + matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) + + matrx[7,x] = round(f(tmp$totalMutationsAtAT, na.rm=T), digits=1) + matrx[7,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) + + matrx[8,x] = round(f(tmp$nonSilentMutationsFR, na.rm=T), digits=1) + matrx[8,y] = round(f(tmp$silentMutationsFR, na.rm=T), digits=1) + matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) + + matrx[9,x] = round(f(tmp$nonSilentMutationsCDR, na.rm=T), digits=1) + matrx[9,y] = round(f(tmp$silentMutationsCDR, na.rm=T), digits=1) + matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) + } transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) row.names(transitionTable) = c("A", "C", "G", "T") @@ -250,93 +248,40 @@ } - write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) - write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep="")) + + write.table(x=transitionTable, file=paste("transitions_", name ,"_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) + write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", name , "_", fname, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) - cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep="")) - cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep="")) + cat(matrx[1,x], file=paste(name, "_", fname, "_value.txt" ,sep="")) + cat(length(tmp$Sequence.ID), file=paste(name, "_", fname, "_n.txt" ,sep="")) + + matrx } -#again for all of the data -tmp = dat -j = i -x = (j * 3) + 1 -y = (j * 3) + 2 -z = (j * 3) + 3 -matrx[1,x] = sum(tmp$VRegionMutations) -matrx[1,y] = sum(tmp$VRegionNucleotides) -matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) +nts = c("a", "c", "g", "t") +zeros=rep(0, 4) -matrx[2,x] = sum(tmp$transitionMutations) -matrx[2,y] = sum(tmp$VRegionMutations) -matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) - -matrx[3,x] = sum(tmp$transversionMutations) -matrx[3,y] = sum(tmp$VRegionMutations) -matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) +funcs = c(median, sum, mean) +fnames = c("median", "sum", "mean") -matrx[4,x] = sum(tmp$transitionMutationsAtGC) -matrx[4,y] = sum(tmp$totalMutationsAtGC) -matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) - -matrx[5,x] = sum(tmp$totalMutationsAtGC) -matrx[5,y] = sum(tmp$VRegionMutations) -matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) - -matrx[6,x] = sum(tmp$transitionMutationsAtAT) -matrx[6,y] = sum(tmp$totalMutationsAtAT) -matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) - -matrx[7,x] = sum(tmp$totalMutationsAtAT) -matrx[7,y] = sum(tmp$VRegionMutations) -matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) - -matrx[8,x] = sum(tmp$nonSilentMutationsFR) -matrx[8,y] = sum(tmp$silentMutationsFR) -matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) +for(i in 1:length(funcs)){ + func = funcs[[i]] + fname = fnames[[i]] + + matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=9) -matrx[9,x] = sum(tmp$nonSilentMutationsCDR) -matrx[9,y] = sum(tmp$silentMutationsCDR) -matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) - -transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) -row.names(transitionTable) = c("A", "C", "G", "T") -transitionTable["A","A"] = NA -transitionTable["C","C"] = NA -transitionTable["G","G"] = NA -transitionTable["T","T"] = NA - + for(i in 1:length(genes)){ + matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) + } -for(nt1 in nts){ - for(nt2 in nts){ - if(nt1 == nt2){ - next - } - NT1 = LETTERS[letters == nt1] - NT2 = LETTERS[letters == nt2] - FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") - CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") - FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") - CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") - FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") - if(include_fr1){ - transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) - } else { - transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) - } - } + matrx = calculate_result(i + 1, ".*", dat, matrx, func, fname, name="all") + + result = data.frame(matrx) + row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") + + write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) } -write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) -write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T) -cat(matrx[1,x], file="total_value.txt") -cat(length(tmp$Sequence.ID), file="total_n.txt") - - - -result = data.frame(matrx) -row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") - -write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) if (!("ggplot2" %in% rownames(installed.packages()))) { diff -r d3542f87a304 -r 7290a88ea202 mutation_analysis.r.bak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.r.bak Mon Feb 29 10:49:39 2016 -0500 @@ -0,0 +1,469 @@ +library(data.table) +library(ggplot2) + +args <- commandArgs(trailingOnly = TRUE) + +input = args[1] +genes = unlist(strsplit(args[2], ",")) +outputdir = args[3] +print(args[4]) +include_fr1 = ifelse(args[4] == "yes", T, F) +setwd(outputdir) + +dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) + +if(length(dat$Sequence.ID) == 0){ + setwd(outputdir) + result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) + row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) + cat("0", file="n.txt") + stop("No data") +} + + + +cleanup_columns = c("FR1.IMGT.c.a", + "FR2.IMGT.g.t", + "CDR1.IMGT.Nb.of.nucleotides", + "CDR2.IMGT.t.a", + "FR1.IMGT.c.g", + "CDR1.IMGT.c.t", + "FR2.IMGT.a.c", + "FR2.IMGT.Nb.of.mutations", + "FR2.IMGT.g.c", + "FR2.IMGT.a.g", + "FR3.IMGT.t.a", + "FR3.IMGT.t.c", + "FR2.IMGT.g.a", + "FR3.IMGT.c.g", + "FR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.g.a", + "CDR1.IMGT.t.g", + "CDR1.IMGT.g.c", + "CDR2.IMGT.Nb.of.nucleotides", + "FR2.IMGT.a.t", + "CDR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.a.g", + "FR3.IMGT.a.c", + "FR1.IMGT.g.a", + "FR3.IMGT.a.g", + "FR1.IMGT.a.t", + "CDR2.IMGT.a.g", + "CDR2.IMGT.Nb.of.mutations", + "CDR2.IMGT.g.t", + "CDR2.IMGT.a.c", + "CDR1.IMGT.t.c", + "FR3.IMGT.g.c", + "FR1.IMGT.g.t", + "FR3.IMGT.g.t", + "CDR1.IMGT.a.t", + "FR1.IMGT.a.g", + "FR3.IMGT.a.t", + "FR3.IMGT.Nb.of.nucleotides", + "FR2.IMGT.t.c", + "CDR2.IMGT.g.a", + "FR2.IMGT.t.a", + "CDR1.IMGT.t.a", + "FR2.IMGT.t.g", + "FR3.IMGT.t.g", + "FR2.IMGT.Nb.of.nucleotides", + "FR1.IMGT.t.a", + "FR1.IMGT.t.g", + "FR3.IMGT.c.t", + "FR1.IMGT.t.c", + "CDR2.IMGT.a.t", + "FR2.IMGT.c.t", + "CDR1.IMGT.g.t", + "CDR2.IMGT.t.g", + "FR1.IMGT.Nb.of.nucleotides", + "CDR1.IMGT.c.g", + "CDR2.IMGT.t.c", + "FR3.IMGT.g.a", + "CDR1.IMGT.a.c", + "FR2.IMGT.c.a", + "FR3.IMGT.Nb.of.mutations", + "FR2.IMGT.c.g", + "CDR2.IMGT.g.c", + "FR1.IMGT.g.c", + "CDR2.IMGT.c.t", + "FR3.IMGT.c.a", + "CDR1.IMGT.c.a", + "CDR2.IMGT.c.g", + "CDR2.IMGT.c.a", + "FR1.IMGT.c.t", + "FR1.IMGT.Nb.of.silent.mutations", + "FR2.IMGT.Nb.of.silent.mutations", + "FR3.IMGT.Nb.of.silent.mutations", + "FR1.IMGT.Nb.of.nonsilent.mutations", + "FR2.IMGT.Nb.of.nonsilent.mutations", + "FR3.IMGT.Nb.of.nonsilent.mutations") + +for(col in cleanup_columns){ + dat[,col] = gsub("\\(.*\\)", "", dat[,col]) + #dat[dat[,col] == "",] = "0" + dat[,col] = as.numeric(dat[,col]) + dat[is.na(dat[,col]),] = 0 +} + +regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3") +if(!include_fr1){ + regions = c("CDR1", "FR2", "CDR2", "FR3") +} + +sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } + +VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="") +dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns) + +VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="") +dat$VRegionNucleotides = apply(dat, FUN=sum_by_row, 1, columns=VRegionNucleotides_columns) + +transitionMutations_columns = paste(rep(regions, each=4), c(".IMGT.a.g", ".IMGT.g.a", ".IMGT.c.t", ".IMGT.t.c"), sep="") +dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns) + +transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="") +dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns) + + +transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") +dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) + + +totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="") +#totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="") +dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) + +transitionMutationsAtAT_columns = paste(rep(regions, each=2), c(".IMGT.a.g",".IMGT.t.c"), sep="") +dat$transitionMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtAT_columns) + +totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="") +#totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="") +dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns) + + +FRRegions = regions[grepl("FR", regions)] +CDRRegions = regions[grepl("CDR", regions)] + +FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="") +dat$silentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_silentMutations_columns) + +CDR_silentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.silent.mutations", sep="") +dat$silentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_silentMutations_columns) + +FR_nonSilentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") +dat$nonSilentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_nonSilentMutations_columns) + +CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") +dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) + +mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") + +write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) + +setwd(outputdir) + +nts = c("a", "c", "g", "t") +zeros=rep(0, 4) +matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=9) +for(i in 1:length(genes)){ + gene = genes[i] + tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] + if(gene == "."){ + tmp = dat + } + j = i - 1 + x = (j * 3) + 1 + y = (j * 3) + 2 + z = (j * 3) + 3 + matrx[1,x] = sum(tmp$VRegionMutations) + matrx[1,y] = sum(tmp$VRegionNucleotides) + matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) + + matrx[2,x] = sum(tmp$transitionMutations) + matrx[2,y] = sum(tmp$VRegionMutations) + matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) + + matrx[3,x] = sum(tmp$transversionMutations) + matrx[3,y] = sum(tmp$VRegionMutations) + matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) + + matrx[4,x] = sum(tmp$transitionMutationsAtGC) + matrx[4,y] = sum(tmp$totalMutationsAtGC) + matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) + + matrx[5,x] = sum(tmp$totalMutationsAtGC) + matrx[5,y] = sum(tmp$VRegionMutations) + matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) + + matrx[6,x] = sum(tmp$transitionMutationsAtAT) + matrx[6,y] = sum(tmp$totalMutationsAtAT) + matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) + + matrx[7,x] = sum(tmp$totalMutationsAtAT) + matrx[7,y] = sum(tmp$VRegionMutations) + matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) + + matrx[8,x] = sum(tmp$nonSilentMutationsFR) + matrx[8,y] = sum(tmp$silentMutationsFR) + matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) + + matrx[9,x] = sum(tmp$nonSilentMutationsCDR) + matrx[9,y] = sum(tmp$silentMutationsCDR) + matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) + + + transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + + if(nrow(tmp) > 0){ + for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + if(include_fr1){ + transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) + } else { + transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) + } + } + } + } + + + write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) + write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + + cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep="")) + cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep="")) +} + +#again for all of the data +tmp = dat +j = i +x = (j * 3) + 1 +y = (j * 3) + 2 +z = (j * 3) + 3 +matrx[1,x] = sum(tmp$VRegionMutations) +matrx[1,y] = sum(tmp$VRegionNucleotides) +matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) + +matrx[2,x] = sum(tmp$transitionMutations) +matrx[2,y] = sum(tmp$VRegionMutations) +matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) + +matrx[3,x] = sum(tmp$transversionMutations) +matrx[3,y] = sum(tmp$VRegionMutations) +matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) + +matrx[4,x] = sum(tmp$transitionMutationsAtGC) +matrx[4,y] = sum(tmp$totalMutationsAtGC) +matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) + +matrx[5,x] = sum(tmp$totalMutationsAtGC) +matrx[5,y] = sum(tmp$VRegionMutations) +matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) + +matrx[6,x] = sum(tmp$transitionMutationsAtAT) +matrx[6,y] = sum(tmp$totalMutationsAtAT) +matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) + +matrx[7,x] = sum(tmp$totalMutationsAtAT) +matrx[7,y] = sum(tmp$VRegionMutations) +matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) + +matrx[8,x] = sum(tmp$nonSilentMutationsFR) +matrx[8,y] = sum(tmp$silentMutationsFR) +matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) + +matrx[9,x] = sum(tmp$nonSilentMutationsCDR) +matrx[9,y] = sum(tmp$silentMutationsCDR) +matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) + +transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) +row.names(transitionTable) = c("A", "C", "G", "T") +transitionTable["A","A"] = NA +transitionTable["C","C"] = NA +transitionTable["G","G"] = NA +transitionTable["T","T"] = NA + + +for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + if(include_fr1){ + transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) + } else { + transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) + } + } +} +write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) +write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T) +cat(matrx[1,x], file="total_value.txt") +cat(length(tmp$Sequence.ID), file="total_n.txt") + + + +result = data.frame(matrx) +row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") + +write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + + +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +} + + +genesForPlot = gsub("[0-9]", "", dat$best_match) +genesForPlot = data.frame(table(genesForPlot)) +colnames(genesForPlot) = c("Gene","Freq") +genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) +write.table(genesForPlot, "all.txt", sep="\t",quote=F,row.names=F,col.names=T) + + +pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) +pc = pc + geom_bar(width = 1, stat = "identity") +pc = pc + coord_polar(theta="y") +pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("Classes", "( n =", sum(genesForPlot$Freq), ")")) + +png(filename="all.png") +pc +dev.off() + + +#blegh +genesForPlot = dat[grepl("ca", dat$best_match),]$best_match +if(length(genesForPlot) > 0){ + genesForPlot = data.frame(table(genesForPlot)) + colnames(genesForPlot) = c("Gene","Freq") + genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) + pc = pc + geom_bar(width = 1, stat = "identity") + pc = pc + coord_polar(theta="y") + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA subclasses", "( n =", sum(genesForPlot$Freq), ")")) + write.table(genesForPlot, "ca.txt", sep="\t",quote=F,row.names=F,col.names=T) + + png(filename="ca.png") + print(pc) + dev.off() +} + +genesForPlot = dat[grepl("cg", dat$best_match),]$best_match +if(length(genesForPlot) > 0){ + genesForPlot = data.frame(table(genesForPlot)) + colnames(genesForPlot) = c("Gene","Freq") + genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) + pc = pc + geom_bar(width = 1, stat = "identity") + pc = pc + coord_polar(theta="y") + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG subclasses", "( n =", sum(genesForPlot$Freq), ")")) + write.table(genesForPlot, "cg.txt", sep="\t",quote=F,row.names=F,col.names=T) + + png(filename="cg.png") + print(pc) + dev.off() +} + +dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) + +p = ggplot(dat, aes(best_match, percentage_mutations)) +p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) +p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + +png(filename="scatter.png") +print(p) +dev.off() + +write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) + +write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) + + + + + + +dat$best_match_class = substr(dat$best_match, 0, 2) +freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") +dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) + +frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")]) + +p = ggplot(frequency_bins_data, aes(frequency_bins, frequency_count)) +p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") +p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + +png(filename="frequency_ranges.png") +print(p) +dev.off() + +frequency_bins_data_by_class = frequency_bins_data + +write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\t",quote=F,row.names=F,col.names=T) + +frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "frequency_bins")]) + +write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T) + + +#frequency_bins_data_by_class +#frequency_ranges_subclasses.txt + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r d3542f87a304 -r 7290a88ea202 wrapper.sh --- a/wrapper.sh Fri Jan 29 08:11:31 2016 -0500 +++ b/wrapper.sh Mon Feb 29 10:49:39 2016 -0500 @@ -33,6 +33,8 @@ cat $PWD/files/*/8_* > $PWD/mutationstats.txt cat $PWD/files/*/10_* > $PWD/hotspots.txt +cp $dir/tabber.js $outdir + #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin" echo "${BLASTN_DIR}" @@ -63,36 +65,54 @@ genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm" echo "R mutation analysis" Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1 + +#echo "." > $output +#exit 0 + echo "python mutation analysis" python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt echo "R AA histogram" Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1 -cat $outdir/mutations.txt $outdir/hotspot_analysis.txt > $outdir/result.txt genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm) +funcs=(sum mean median) -echo "

$title

" > $output -echo "" >> $output -for gene in ${genes[@]} + +echo "

$title

" >> $output + +for func in ${funcs[@]} do - tmp=`cat $outdir/${gene}_n.txt` - echo "" >> $output -done -tmp=`cat $outdir/total_n.txt` -echo "" >> $output + cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt + + echo "
info${gene} (N = $tmp)all (N = $tmp)
" >> $output + echo "info" >> $output + for gene in ${genes[@]} + do + tmp=`cat $outdir/${gene}_${func}_n.txt` + echo "" >> $output + done + tmp=`cat $outdir/all_${func}_n.txt` + echo "" >> $output -while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz -do - if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh - echo "" >> $output - else - echo "" >> $output - fi -done < $outdir/result.txt + while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz + do + if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh + echo "" >> $output + else + echo "" >> $output + fi + done < $outdir/result.txt + +done + echo "

${func} table

${gene} (N = $tmp)all (N = $tmp)
$name${cax}/${cay} (${caz})${ca1x}/${ca1y} (${ca1z})${ca2x}/${ca2y} (${ca2z})${cgx}/${cgy} (${cgz})${cg1x}/${cg1y} (${cg1z})${cg2x}/${cg2y} (${cg2z})${cg3x}/${cg3y} (${cg3z})${cg4x}/${cg4y} (${cg4z})${cmx}/${cmy} (${cmz})${allx}/${ally} (${allz})
$name${cax}/${cay} (${caz}%)${ca1x}/${ca1y} (${ca1z}%)${ca2x}/${ca2y} (${ca2z}%)${cgx}/${cgy} (${cgz}%)${cg1x}/${cg1y} (${cg1z}%)${cg2x}/${cg2y} (${cg2z}%)${cg3x}/${cg3y} (${cg3z}%)${cg4x}/${cg4y} (${cg4z}%)${cmx}/${cmy} (${cmz}%)${allx}/${ally} (${allz}%)
$name${cax}/${cay} (${caz})${ca1x}/${ca1y} (${ca1z})${ca2x}/${ca2y} (${ca2z})${cgx}/${cgy} (${cgz})${cg1x}/${cg1y} (${cg1z})${cg2x}/${cg2y} (${cg2z})${cg3x}/${cg3y} (${cg3z})${cg4x}/${cg4y} (${cg4z})${cmx}/${cmy} (${cmz})${allx}/${ally} (${allz})
$name${cax}/${cay} (${caz}%)${ca1x}/${ca1y} (${ca1z}%)${ca2x}/${ca2y} (${ca2z}%)${cgx}/${cgy} (${cgz}%)${cg1x}/${cg1y} (${cg1z}%)${cg2x}/${cg2y} (${cg2z}%)${cg3x}/${cg3y} (${cg3z}%)${cg4x}/${cg4y} (${cg4z}%)${cmx}/${cmy} (${cmz}%)${allx}/${ally} (${allz}%)
" >> $output -echo "unmatched
motif per sequence
all data
mutations by id
AA mutations location by id
Absant AA locations by id
" >> $output - +echo "unmatched
" >> $output +echo "motif per sequence
" >> $output +echo "all data
" >> $output +echo "mutations by id
" >> $output +echo "AA mutations location by id
" >> $output +echo "Absant AA locations by id
" >> $output echo "
" >> $output echo "download data
" >> $output @@ -129,7 +149,7 @@ while IFS=, read from a c g t do echo "$from$a$c$g$t" >> $output - done < $outdir/transitions_${gene}.txt + done < $outdir/transitions_${gene}_sum.txt echo "" >> $output done @@ -137,7 +157,7 @@ while IFS=, read from a c g t do echo "$from$a$c$g$t" >> $output -done < $outdir/transitions.txt +done < $outdir/transitions_all_sum.txt echo "" >> $output echo "" >> $output