diff mutation_analysis.r @ 49:5c6b9e99d576 draft

Uploaded
author davidvanzessen
date Wed, 18 Nov 2015 05:55:04 -0500
parents 099cc1254f74
children 7290a88ea202
line wrap: on
line diff
--- a/mutation_analysis.r	Thu Nov 12 09:46:37 2015 -0500
+++ b/mutation_analysis.r	Wed Nov 18 05:55:04 2015 -0500
@@ -1,3 +1,6 @@
+library(data.table)
+library(ggplot2)
+
 args <- commandArgs(trailingOnly = TRUE)
 
 input = args[1]
@@ -133,9 +136,19 @@
 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="")
 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns)
 
-totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t"), sep="")
+
+totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="")
+#totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="")
 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns)
 
+transitionMutationsAtAT_columns = paste(rep(regions, each=2), c(".IMGT.a.g",".IMGT.t.c"), sep="")
+dat$transitionMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtAT_columns)
+
+totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="")
+#totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="")
+dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns)
+
+
 FRRegions = regions[grepl("FR", regions)]
 CDRRegions = regions[grepl("CDR", regions)]
 
@@ -151,7 +164,7 @@
 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="")
 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns)
 
-mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR")
+mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR")
 
 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T)
 
@@ -159,7 +172,7 @@
 
 nts = c("a", "c", "g", "t")
 zeros=rep(0, 4)
-matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=7)
+matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=9)
 for(i in 1:length(genes)){
   gene = genes[i]
   tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),]
@@ -173,24 +186,38 @@
   matrx[1,x] = sum(tmp$VRegionMutations)
   matrx[1,y] = sum(tmp$VRegionNucleotides)
   matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
+
   matrx[2,x] = sum(tmp$transitionMutations)
   matrx[2,y] = sum(tmp$VRegionMutations)
   matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
+
   matrx[3,x] = sum(tmp$transversionMutations)
   matrx[3,y] = sum(tmp$VRegionMutations)
   matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
+
   matrx[4,x] = sum(tmp$transitionMutationsAtGC)
   matrx[4,y] = sum(tmp$totalMutationsAtGC)
   matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
+
   matrx[5,x] = sum(tmp$totalMutationsAtGC)
   matrx[5,y] = sum(tmp$VRegionMutations)
   matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
-  matrx[6,x] = sum(tmp$nonSilentMutationsFR)
-  matrx[6,y] = sum(tmp$silentMutationsFR)
-  matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1)
-  matrx[7,x] = sum(tmp$nonSilentMutationsCDR)
-  matrx[7,y] = sum(tmp$silentMutationsCDR)
-  matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1)
+
+  matrx[6,x] = sum(tmp$transitionMutationsAtAT)
+  matrx[6,y] = sum(tmp$totalMutationsAtAT)
+  matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1)
+
+  matrx[7,x] = sum(tmp$totalMutationsAtAT)
+  matrx[7,y] = sum(tmp$VRegionMutations)
+  matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1)
+
+  matrx[8,x] = sum(tmp$nonSilentMutationsFR)
+  matrx[8,y] = sum(tmp$silentMutationsFR)
+  matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1)
+
+  matrx[9,x] = sum(tmp$nonSilentMutationsCDR)
+  matrx[9,y] = sum(tmp$silentMutationsCDR)
+  matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1)
   
   
   transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros)
@@ -239,24 +266,38 @@
 matrx[1,x] = sum(tmp$VRegionMutations)
 matrx[1,y] = sum(tmp$VRegionNucleotides)
 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
+
 matrx[2,x] = sum(tmp$transitionMutations)
 matrx[2,y] = sum(tmp$VRegionMutations)
 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
+
 matrx[3,x] = sum(tmp$transversionMutations)
 matrx[3,y] = sum(tmp$VRegionMutations)
 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
+
 matrx[4,x] = sum(tmp$transitionMutationsAtGC)
 matrx[4,y] = sum(tmp$totalMutationsAtGC)
 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
+
 matrx[5,x] = sum(tmp$totalMutationsAtGC)
 matrx[5,y] = sum(tmp$VRegionMutations)
 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
-matrx[6,x] = sum(tmp$nonSilentMutationsFR)
-matrx[6,y] = sum(tmp$silentMutationsFR)
-matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1)
-matrx[7,x] = sum(tmp$nonSilentMutationsCDR)
-matrx[7,y] = sum(tmp$silentMutationsCDR)
-matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1)
+
+matrx[6,x] = sum(tmp$transitionMutationsAtAT)
+matrx[6,y] = sum(tmp$totalMutationsAtAT)
+matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1)
+
+matrx[7,x] = sum(tmp$totalMutationsAtAT)
+matrx[7,y] = sum(tmp$VRegionMutations)
+matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1)
+
+matrx[8,x] = sum(tmp$nonSilentMutationsFR)
+matrx[8,y] = sum(tmp$silentMutationsFR)
+matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1)
+
+matrx[9,x] = sum(tmp$nonSilentMutationsCDR)
+matrx[9,y] = sum(tmp$silentMutationsCDR)
+matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1)
 
 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
 row.names(transitionTable) = c("A", "C", "G", "T")
@@ -293,7 +334,7 @@
 
 
 result = data.frame(matrx)
-row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)", "FR R/S (ratio)", "CDR R/S (ratio)")
+row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)")
 
 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
 
@@ -301,7 +342,7 @@
 if (!("ggplot2" %in% rownames(installed.packages()))) {
 	install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
 }
-library(ggplot2)
+
 
 genesForPlot = gsub("[0-9]", "", dat$best_match)
 genesForPlot = data.frame(table(genesForPlot))
@@ -360,13 +401,46 @@
 p = ggplot(dat, aes(best_match, percentage_mutations))
 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)
 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")
-write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
-
 
 png(filename="scatter.png")
 print(p)
 dev.off()
 
+write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
+
+write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T)
+
+
+
+
+
+
+dat$best_match_class = substr(dat$best_match, 0, 2)
+freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
+dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)
+
+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])
+
+p = ggplot(frequency_bins_data, aes(frequency_bins, frequency_count))
+p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge")
+p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class")
+
+png(filename="frequency_ranges.png")
+print(p)
+dev.off()
+
+frequency_bins_data_by_class = frequency_bins_data
+
+write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\t",quote=F,row.names=F,col.names=T)
+
+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "frequency_bins")])
+
+write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T)
+
+
+#frequency_bins_data_by_class
+#frequency_ranges_subclasses.txt
+
 
 
 
@@ -393,8 +467,3 @@
 
 
 
-
-
-
-
-