Mercurial > repos > davidvanzessen > mutation_analysis
changeset 49:5c6b9e99d576 draft
Uploaded
| author | davidvanzessen | 
|---|---|
| date | Wed, 18 Nov 2015 05:55:04 -0500 | 
| parents | d09b1bdfd388 | 
| children | 8ba6afa1247a | 
| files | aa_histogram.r merge_and_filter.r mutation_analysis.py mutation_analysis.r wrapper.sh | 
| diffstat | 4 files changed, 250 insertions(+), 109 deletions(-) [+] | 
line wrap: on
 line diff
--- a/aa_histogram.r Thu Nov 12 09:46:37 2015 -0500 +++ b/aa_histogram.r Wed Nov 18 05:55:04 2015 -0500 @@ -5,20 +5,28 @@ input = args[1] outfile = args[2] -dat = read.table(input, header=F, sep=",") -dat=as.numeric(dat[1,]) -dat_sum = sum(dat) +dat = read.table(input, sep="\t", fill=T, header=T, quote="") + + + +mutations.at.position = as.numeric(dat[1,]) +aa.at.position = as.numeric(dat[2,]) -dat_freq = dat / dat_sum * 100 +dat_freq = mutations.at.position / aa.at.position dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq) +options(width=220) + +print(dat[,20:40]) + m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) m = m + geom_histogram(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=1:length(dat_freq), labels=1:length(dat_freq)) -m = m + annotate("segment", x = 0.5, y = -0.3, xend=26.5, yend=-0.3, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.2, label="FR1") -m = m + annotate("segment", x = 26.5, y = -0.4, xend=38.5, yend=-0.4, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.3, label="CDR1") -m = m + annotate("segment", x = 38.5, y = -0.3, xend=55.5, yend=-0.3, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.2, label="FR2") -m = m + annotate("segment", x = 55.5, y = -0.4, xend=65.5, yend=-0.4, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.3, label="CDR2") -m = m + annotate("segment", x = 65.5, y = -0.3, xend=104.5, yend=-0.3, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.2, label="FR3") +m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1") +m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1") +m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2") +m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2") +m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3") +m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle("AA mutation frequency") write.table(dat_dt, paste(dirname(outfile), "/aa_histogram.txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) png(filename=outfile, width=1280, height=720)
--- a/mutation_analysis.py Thu Nov 12 09:46:37 2015 -0500 +++ b/mutation_analysis.py Wed Nov 18 05:55:04 2015 -0500 @@ -1,9 +1,10 @@ +from collections import defaultdict import re import argparse parser = argparse.ArgumentParser() parser.add_argument("--input", - help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") + help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") parser.add_argument("--genes", help="The genes available in the 'best_match' column") parser.add_argument("--includefr1", help="The genes available in the 'best_match' column") parser.add_argument("--output", help="Output file") @@ -33,6 +34,8 @@ IDlist = [] mutationList = [] mutationListByID = {} +cdr1LengthDic = {} +cdr2LengthDic = {} with open(infile, 'r') as i: for line in i: @@ -45,6 +48,8 @@ fr2Index = linesplt.index("FR2.IMGT") cdr2Index = linesplt.index("CDR2.IMGT") fr3Index = linesplt.index("FR3.IMGT") + cdr1LengthIndex = linesplt.index("CDR1.IMGT.Nb.of.nucleotides") + cdr2LengthIndex = linesplt.index("CDR2.IMGT.Nb.of.nucleotides") first = False continue linecount += 1 @@ -62,9 +67,17 @@ mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + cdr1Length = linesplt[cdr1LengthIndex] + cdr2Length = linesplt[cdr2LengthIndex] + + cdr1LengthDic[ID] = int(cdr1Length) / 3 + cdr2LengthDic[ID] = int(cdr2Length) / 3 + IDlist += [ID] -AA_mutation = [0] * (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent +AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent + +AA_mutation = [0] * AALength AA_mutation_empty = AA_mutation[:] aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" @@ -75,28 +88,74 @@ if mutation[4]: AA_mutation[int(mutation[4])] += 1 AA_mutation_for_ID[int(mutation[4])] += 1 - o.write(ID + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID]) + "\n") + o.write(ID + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") -#for mutation in mutationList: -# if mutation[4]: # if non silent mutation -# AA_mutation[int(mutation[4])] += 1 +#absent AA stuff +absentAACDR1Dic = defaultdict(list) +absentAACDR1Dic[5] = range(29,36) +absentAACDR1Dic[6] = range(29,35) +absentAACDR1Dic[7] = range(30,35) +absentAACDR1Dic[8] = range(30,34) +absentAACDR1Dic[9] = range(31,34) +absentAACDR1Dic[10] = range(31,33) +absentAACDR1Dic[11] = [32] + +absentAACDR2Dic = defaultdict(list) +absentAACDR2Dic[0] = range(55,65) +absentAACDR2Dic[1] = range(56,65) +absentAACDR2Dic[2] = range(56,64) +absentAACDR2Dic[3] = range(57,64) +absentAACDR2Dic[4] = range(57,63) +absentAACDR2Dic[5] = range(58,63) +absentAACDR2Dic[6] = range(58,62) +absentAACDR2Dic[7] = range(59,62) +absentAACDR2Dic[8] = range(59,61) +absentAACDR2Dic[9] = [60] + +absentAA = [len(IDlist)] * (AALength-1) +for k, cdr1Length in cdr1LengthDic.iteritems(): + for c in absentAACDR1Dic[cdr1Length]: + absentAA[c] -= 1 + +for k, cdr2Length in cdr2LengthDic.iteritems(): + for c in absentAACDR2Dic[cdr2Length]: + absentAA[c] -= 1 + + +aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" +with open(aa_mutations_by_id_file, 'w') as o: + o.write("ID\tcdr1length\tcdr2length\t" + "\t".join([str(x) for x in range(1,AALength-1)]) + "\n") + for ID in IDlist: + absentAAbyID = [1] * (AALength-1) + cdr1Length = cdr1LengthDic[ID] + for c in absentAACDR1Dic[cdr1Length]: + absentAAbyID[c] -= 1 + + cdr2Length = cdr2LengthDic[ID] + for c in absentAACDR2Dic[cdr2Length]: + absentAAbyID[c] -= 1 + o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") + + aa_mutations_file = outfile[:outfile.rindex("/")] + "/aa_mutations.txt" with open(aa_mutations_file, 'w') as o: - o.write(",".join([str(x) for x in AA_mutation]) + "\n") + o.write("row.name\t" + "\t".join([str(x) for x in range(1, AALength-1)]) + "\n") + o.write("mutations.at.position\t" + "\t".join([str(x) for x in AA_mutation[1:]]) + "\n") + o.write("AA.at.position\t" + "\t".join([str(x) for x in absentAA]) + "\n") if linecount == 0: - print "No data, exiting" - with open(outfile, 'w') as o: - o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) - o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) - o.write("WA (%)," + ("0,0,0\n" * len(genes))) - o.write("TW (%)," + ("0,0,0\n" * len(genes))) - import sys + print "No data, exiting" + with open(outfile, 'w') as o: + o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) + o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) + o.write("WA (%)," + ("0,0,0\n" * len(genes))) + o.write("TW (%)," + ("0,0,0\n" * len(genes))) + import sys - sys.exit() + sys.exit() hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") RGYWCount = {g: 0 for g in genes} @@ -111,80 +170,80 @@ atagcctIndex = 0 first = True with open(infile, 'r') as i: - for line in i: - if first: - linesplt = line.split("\t") - ataIndex = linesplt.index("X.a.t.a") - tatIndex = linesplt.index("t.a.t.") - aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.") - atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.") - first = False - continue - linesplt = line.split("\t") - gene = linesplt[best_matchIndex] - ID = linesplt[IDIndex] - RGYW = [(int(x), int(y), z) for (x, y, z) in - [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] - WRCY = [(int(x), int(y), z) for (x, y, z) in - [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] - WA = [(int(x), int(y), z) for (x, y, z) in - [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] - TW = [(int(x), int(y), z) for (x, y, z) in - [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] - RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 + for line in i: + if first: + linesplt = line.split("\t") + ataIndex = linesplt.index("X.a.t.a") + tatIndex = linesplt.index("t.a.t.") + aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.") + atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.") + first = False + continue + linesplt = line.split("\t") + gene = linesplt[best_matchIndex] + ID = linesplt[IDIndex] + RGYW = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] + WRCY = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] + WA = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] + TW = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] + RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 - mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ - ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] - for mutation in mutationList: - frm, where, to, AAfrm, AAwhere, AAto, junk = mutation - mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) - mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) - mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) - mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) + mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ + ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + for mutation in mutationList: + frm, where, to, AAfrm, AAwhere, AAto, junk = mutation + mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) + mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) + mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) + mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) - in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) + in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) - if in_how_many_motifs > 0: - RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs - WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs - WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs - TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs + if in_how_many_motifs > 0: + RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs + WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs + WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs + TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs directory = outfile[:outfile.rfind("/") + 1] value = 0 valuedic = dict() for gene in genes: - with open(directory + gene + "_value.txt", 'r') as v: - valuedic[gene] = int(v.readlines()[0].rstrip()) + with open(directory + gene + "_value.txt", 'r') as v: + valuedic[gene] = int(v.readlines()[0].rstrip()) with open(directory + "total_value.txt", 'r') as v: - valuedic["total"] = int(v.readlines()[0].rstrip()) + valuedic["total"] = int(v.readlines()[0].rstrip()) dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} arr = ["RGYW", "WRCY", "WA", "TW"] with open(outfile, 'w') as o: - for typ in arr: - o.write(typ + " (%)") - curr = dic[typ] - for gene in genes: - geneMatcher = re.compile(".*" + gene + ".*") - if valuedic[gene] is 0: - o.write(",0,0,0") - else: - x = int(round(sum([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]]))) - y = valuedic[gene] - z = str(round(x / float(valuedic[gene]) * 100, 1)) - o.write("," + str(x) + "," + str(y) + "," + z) - # for total - x = int(round(sum([y for x, y in curr.iteritems()]))) - y = valuedic["total"] - z = str(round(x / float(valuedic["total"]) * 100, 1)) - o.write("," + str(x) + "," + str(y) + "," + z + "\n") + for typ in arr: + o.write(typ + " (%)") + curr = dic[typ] + for gene in genes: + geneMatcher = re.compile(".*" + gene + ".*") + if valuedic[gene] is 0: + o.write(",0,0,0") + else: + x = int(round(sum([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]]))) + y = valuedic[gene] + z = str(round(x / float(valuedic[gene]) * 100, 1)) + o.write("," + str(x) + "," + str(y) + "," + z) + # for total + x = int(round(sum([y for x, y in curr.iteritems()]))) + y = valuedic["total"] + z = str(round(x / float(valuedic["total"]) * 100, 1)) + o.write("," + str(x) + "," + str(y) + "," + z + "\n") # for testing seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt" with open(seq_motif_file, 'w') as o: - o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") - for ID in IDlist: - o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str( - round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") + o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") + for ID in IDlist: + o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str( + round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")
--- a/mutation_analysis.r Thu Nov 12 09:46:37 2015 -0500 +++ b/mutation_analysis.r Wed Nov 18 05:55:04 2015 -0500 @@ -1,3 +1,6 @@ +library(data.table) +library(ggplot2) + args <- commandArgs(trailingOnly = TRUE) input = args[1] @@ -133,9 +136,19 @@ transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) -totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t"), sep="") + +totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="") +#totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="") dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) +transitionMutationsAtAT_columns = paste(rep(regions, each=2), c(".IMGT.a.g",".IMGT.t.c"), sep="") +dat$transitionMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtAT_columns) + +totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="") +#totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="") +dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns) + + FRRegions = regions[grepl("FR", regions)] CDRRegions = regions[grepl("CDR", regions)] @@ -151,7 +164,7 @@ CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) -mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") +mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) @@ -159,7 +172,7 @@ nts = c("a", "c", "g", "t") zeros=rep(0, 4) -matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=7) +matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=9) for(i in 1:length(genes)){ gene = genes[i] tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] @@ -173,24 +186,38 @@ matrx[1,x] = sum(tmp$VRegionMutations) matrx[1,y] = sum(tmp$VRegionNucleotides) matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) + matrx[2,x] = sum(tmp$transitionMutations) matrx[2,y] = sum(tmp$VRegionMutations) matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) + matrx[3,x] = sum(tmp$transversionMutations) matrx[3,y] = sum(tmp$VRegionMutations) matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) + matrx[4,x] = sum(tmp$transitionMutationsAtGC) matrx[4,y] = sum(tmp$totalMutationsAtGC) matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) + matrx[5,x] = sum(tmp$totalMutationsAtGC) matrx[5,y] = sum(tmp$VRegionMutations) matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) - matrx[6,x] = sum(tmp$nonSilentMutationsFR) - matrx[6,y] = sum(tmp$silentMutationsFR) - matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1) - matrx[7,x] = sum(tmp$nonSilentMutationsCDR) - matrx[7,y] = sum(tmp$silentMutationsCDR) - matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1) + + matrx[6,x] = sum(tmp$transitionMutationsAtAT) + matrx[6,y] = sum(tmp$totalMutationsAtAT) + matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) + + matrx[7,x] = sum(tmp$totalMutationsAtAT) + matrx[7,y] = sum(tmp$VRegionMutations) + matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) + + matrx[8,x] = sum(tmp$nonSilentMutationsFR) + matrx[8,y] = sum(tmp$silentMutationsFR) + matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) + + matrx[9,x] = sum(tmp$nonSilentMutationsCDR) + matrx[9,y] = sum(tmp$silentMutationsCDR) + matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) @@ -239,24 +266,38 @@ matrx[1,x] = sum(tmp$VRegionMutations) matrx[1,y] = sum(tmp$VRegionNucleotides) matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) + matrx[2,x] = sum(tmp$transitionMutations) matrx[2,y] = sum(tmp$VRegionMutations) matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) + matrx[3,x] = sum(tmp$transversionMutations) matrx[3,y] = sum(tmp$VRegionMutations) matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) + matrx[4,x] = sum(tmp$transitionMutationsAtGC) matrx[4,y] = sum(tmp$totalMutationsAtGC) matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) + matrx[5,x] = sum(tmp$totalMutationsAtGC) matrx[5,y] = sum(tmp$VRegionMutations) matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) -matrx[6,x] = sum(tmp$nonSilentMutationsFR) -matrx[6,y] = sum(tmp$silentMutationsFR) -matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1) -matrx[7,x] = sum(tmp$nonSilentMutationsCDR) -matrx[7,y] = sum(tmp$silentMutationsCDR) -matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1) + +matrx[6,x] = sum(tmp$transitionMutationsAtAT) +matrx[6,y] = sum(tmp$totalMutationsAtAT) +matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) + +matrx[7,x] = sum(tmp$totalMutationsAtAT) +matrx[7,y] = sum(tmp$VRegionMutations) +matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) + +matrx[8,x] = sum(tmp$nonSilentMutationsFR) +matrx[8,y] = sum(tmp$silentMutationsFR) +matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) + +matrx[9,x] = sum(tmp$nonSilentMutationsCDR) +matrx[9,y] = sum(tmp$silentMutationsCDR) +matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) row.names(transitionTable) = c("A", "C", "G", "T") @@ -293,7 +334,7 @@ result = data.frame(matrx) -row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)", "FR R/S (ratio)", "CDR R/S (ratio)") +row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) @@ -301,7 +342,7 @@ if (!("ggplot2" %in% rownames(installed.packages()))) { install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") } -library(ggplot2) + genesForPlot = gsub("[0-9]", "", dat$best_match) genesForPlot = data.frame(table(genesForPlot)) @@ -360,13 +401,46 @@ p = ggplot(dat, aes(best_match, percentage_mutations)) p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") -write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) - png(filename="scatter.png") print(p) dev.off() +write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) + +write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) + + + + + + +dat$best_match_class = substr(dat$best_match, 0, 2) +freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") +dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) + +frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")]) + +p = ggplot(frequency_bins_data, aes(frequency_bins, frequency_count)) +p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") +p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + +png(filename="frequency_ranges.png") +print(p) +dev.off() + +frequency_bins_data_by_class = frequency_bins_data + +write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\t",quote=F,row.names=F,col.names=T) + +frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "frequency_bins")]) + +write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T) + + +#frequency_bins_data_by_class +#frequency_ranges_subclasses.txt + @@ -393,8 +467,3 @@ - - - - -
--- a/wrapper.sh Thu Nov 12 09:46:37 2015 -0500 +++ b/wrapper.sh Wed Nov 18 05:55:04 2015 -0500 @@ -66,7 +66,6 @@ python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt echo "R AA histogram" Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1 - cat $outdir/mutations.txt $outdir/hotspot_analysis.txt > $outdir/result.txt genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm) @@ -91,7 +90,7 @@ fi done < $outdir/result.txt echo "</table>" >> $output -echo "<a href='unmatched.txt'>unmatched</a><br /><a href='motif_per_seq.txt'>motif per sequence</a><br /><a href='merged.txt'>all data</a><br /><a href='mutation_by_id.txt'>mutations by id</a><br /><a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output +echo "<a href='unmatched.txt'>unmatched</a><br /><a href='motif_per_seq.txt'>motif per sequence</a><br /><a href='merged.txt'>all data</a><br /><a href='mutation_by_id.txt'>mutations by id</a><br /><a href='aa_id_mutations.txt'>AA mutations location by id</a><br /><a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output echo "<img src='all.png'/><br />" >> $output @@ -111,6 +110,12 @@ echo "<img src='scatter.png'/><br />" >> $output echo "<a href='scatter.txt'>download data</a><br />" >> $output fi +if [ -a $outdir/frequency_ranges.png ] +then + echo "<img src='frequency_ranges.png'/><br />" >> $output + echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output + echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output +fi if [ -a $outdir/aa_histogram.png ] then echo "<img src='aa_histogram.png'/><br />" >> $output
