diff mutation_analysis.r @ 4:069419cccba4 draft

Uploaded
author davidvanzessen
date Mon, 22 Sep 2014 10:19:36 -0400
parents a0b27058dcac
children cb7c65e3e43f
line wrap: on
line diff
--- a/mutation_analysis.r	Wed Sep 17 07:25:17 2014 -0400
+++ b/mutation_analysis.r	Mon Sep 22 10:19:36 2014 -0400
@@ -1,38 +1,29 @@
 args <- commandArgs(trailingOnly = TRUE)
 
 input = args[1]
-summaryinput = args[2]
+genes = unlist(strsplit(args[2], ","))
 outputdir = args[3]
 setwd(outputdir)
 
-#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F)
 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
 
-datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F)
-datSum = datSum[,c("Sequence.ID","J.GENE.and.allele", "AA.JUNCTION")]
-
-dat = merge(dat, datSum, by="Sequence.ID", all.x=T)
-
 if(length(dat$Sequence.ID) == 0){
-	setwd(outputdir)
-	result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
-	row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
-	write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
-	transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
-	row.names(transitionTable) = c("A", "C", "G", "T")
-	transitionTable["A","A"] = NA
-	transitionTable["C","C"] = NA
-	transitionTable["G","G"] = NA
-	transitionTable["T","T"] = NA
-	write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
-	cat("0", file="n.txt")
-	stop("No data")
+  setwd(outputdir)
+  result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
+  row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
+  write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+  transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
+  row.names(transitionTable) = c("A", "C", "G", "T")
+  transitionTable["A","A"] = NA
+  transitionTable["C","C"] = NA
+  transitionTable["G","G"] = NA
+  transitionTable["T","T"] = NA
+  write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
+  cat("0", file="n.txt")
+  stop("No data")
 }
 
-#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left"))
 
-result = data.frame(x = 1:5, y = 1:5, z = 1:5)
-row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
 
 cleanup_columns = c("FR1.IMGT.c.a",
                     "FR2.IMGT.g.t",
@@ -111,32 +102,19 @@
   dat[is.na(dat[,col]),] = 0
 }
 
-dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
-dat$VGene = gsub("[*].*", "", dat$VGene)
-dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
-dat$JGene = gsub("[*].*", "", dat$JGene)
-
-dat$past = paste(dat$AA.JUNCTION, dat$VGene, dat$JGene, (dat$FR1.IMGT.Nb.of.mutations + dat$CDR1.IMGT.Nb.of.mutations + dat$FR2.IMGT.Nb.of.mutations + dat$CDR2.IMGT.Nb.of.mutations + dat$FR3.IMGT.Nb.of.mutations))
-
-dat = dat[!duplicated(dat$past), ]
-
-VRegionMutations =  sum(dat$FR1.IMGT.Nb.of.mutations + 
+dat$VRegionMutations =  dat$FR1.IMGT.Nb.of.mutations + 
                         dat$CDR1.IMGT.Nb.of.mutations + 
                         dat$FR2.IMGT.Nb.of.mutations + 
                         dat$CDR2.IMGT.Nb.of.mutations +
-                        dat$FR3.IMGT.Nb.of.mutations)
+                        dat$FR3.IMGT.Nb.of.mutations
 
-VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides +
+dat$VRegionNucleotides =  dat$FR1.IMGT.Nb.of.nucleotides +
                           dat$CDR1.IMGT.Nb.of.nucleotides +
                           dat$FR2.IMGT.Nb.of.nucleotides +
                           dat$CDR2.IMGT.Nb.of.nucleotides +
-                          dat$FR3.IMGT.Nb.of.nucleotides)
+                          dat$FR3.IMGT.Nb.of.nucleotides
 
-result[1,"x"] = VRegionMutations
-result[1,"y"] = VRegionNucleotides
-result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1)
-
-transitionMutations = sum(dat$FR1.IMGT.a.g +
+dat$transitionMutations = dat$FR1.IMGT.a.g +
                           dat$FR1.IMGT.g.a +
                           dat$FR1.IMGT.c.t +
                           dat$FR1.IMGT.t.c +
@@ -155,58 +133,51 @@
                           dat$FR3.IMGT.a.g +
                           dat$FR3.IMGT.g.a +
                           dat$FR3.IMGT.c.t +
-                          dat$FR3.IMGT.t.c)
-
-result[2,"x"] = transitionMutations
-result[2,"y"] = VRegionMutations
-result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1)
+                          dat$FR3.IMGT.t.c
 
-transversionMutations = sum(  dat$FR1.IMGT.a.c +
-                              dat$FR1.IMGT.c.a +
-                              dat$FR1.IMGT.a.t +
-                              dat$FR1.IMGT.t.a +
-                              dat$FR1.IMGT.g.c +
-                              dat$FR1.IMGT.c.g +
-                              dat$FR1.IMGT.g.t +
-                              dat$FR1.IMGT.t.g +
-                              dat$CDR1.IMGT.a.c +
-                              dat$CDR1.IMGT.c.a +
-                              dat$CDR1.IMGT.a.t +
-                              dat$CDR1.IMGT.t.a +
-                              dat$CDR1.IMGT.g.c +
-                              dat$CDR1.IMGT.c.g +
-                              dat$CDR1.IMGT.g.t +
-                              dat$CDR1.IMGT.t.g +
-                              dat$FR2.IMGT.a.c +
-                              dat$FR2.IMGT.c.a +
-                              dat$FR2.IMGT.a.t +
-                              dat$FR2.IMGT.t.a +
-                              dat$FR2.IMGT.g.c +
-                              dat$FR2.IMGT.c.g +
-                              dat$FR2.IMGT.g.t +
-                              dat$FR2.IMGT.t.g +
-                              dat$CDR2.IMGT.a.c +
-                              dat$CDR2.IMGT.c.a +
-                              dat$CDR2.IMGT.a.t +
-                              dat$CDR2.IMGT.t.a +
-                              dat$CDR2.IMGT.g.c +
-                              dat$CDR2.IMGT.c.g +
-                              dat$CDR2.IMGT.g.t +
-                              dat$CDR2.IMGT.t.g +
-                              dat$FR3.IMGT.a.c +
-                              dat$FR3.IMGT.c.a +
-                              dat$FR3.IMGT.a.t +
-                              dat$FR3.IMGT.t.a +
-                              dat$FR3.IMGT.g.c +
-                              dat$FR3.IMGT.c.g +
-                              dat$FR3.IMGT.g.t +
-                              dat$FR3.IMGT.t.g)
+dat$transversionMutations = dat$FR1.IMGT.a.c +
+                            dat$FR1.IMGT.c.a +
+                            dat$FR1.IMGT.a.t +
+                            dat$FR1.IMGT.t.a +
+                            dat$FR1.IMGT.g.c +
+                            dat$FR1.IMGT.c.g +
+                            dat$FR1.IMGT.g.t +
+                            dat$FR1.IMGT.t.g +
+                            dat$CDR1.IMGT.a.c +
+                            dat$CDR1.IMGT.c.a +
+                            dat$CDR1.IMGT.a.t +
+                            dat$CDR1.IMGT.t.a +
+                            dat$CDR1.IMGT.g.c +
+                            dat$CDR1.IMGT.c.g +
+                            dat$CDR1.IMGT.g.t +
+                            dat$CDR1.IMGT.t.g +
+                            dat$FR2.IMGT.a.c +
+                            dat$FR2.IMGT.c.a +
+                            dat$FR2.IMGT.a.t +
+                            dat$FR2.IMGT.t.a +
+                            dat$FR2.IMGT.g.c +
+                            dat$FR2.IMGT.c.g +
+                            dat$FR2.IMGT.g.t +
+                            dat$FR2.IMGT.t.g +
+                            dat$CDR2.IMGT.a.c +
+                            dat$CDR2.IMGT.c.a +
+                            dat$CDR2.IMGT.a.t +
+                            dat$CDR2.IMGT.t.a +
+                            dat$CDR2.IMGT.g.c +
+                            dat$CDR2.IMGT.c.g +
+                            dat$CDR2.IMGT.g.t +
+                            dat$CDR2.IMGT.t.g +
+                            dat$FR3.IMGT.a.c +
+                            dat$FR3.IMGT.c.a +
+                            dat$FR3.IMGT.a.t +
+                            dat$FR3.IMGT.t.a +
+                            dat$FR3.IMGT.g.c +
+                            dat$FR3.IMGT.c.g +
+                            dat$FR3.IMGT.g.t +
+                            dat$FR3.IMGT.t.g
 
-result[3,"x"] = transversionMutations
-result[3,"y"] = VRegionMutations
-result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1)
 
-transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a +
+dat$transitionMutationsAtGC = dat$FR1.IMGT.g.a +
                               dat$FR1.IMGT.c.t +
                               dat$CDR1.IMGT.g.a +
                               dat$CDR1.IMGT.c.t +
@@ -215,9 +186,9 @@
                               dat$CDR2.IMGT.g.a +
                               dat$CDR2.IMGT.c.t +
                               dat$FR3.IMGT.g.a +
-                              dat$FR3.IMGT.c.t)
+                              dat$FR3.IMGT.c.t
 
-totalMutationsAtGC = sum(dat$FR1.IMGT.g.a +
+dat$totalMutationsAtGC = dat$FR1.IMGT.g.a +
                          dat$FR1.IMGT.c.t +
                          dat$FR1.IMGT.c.a +
                          dat$FR1.IMGT.g.c +
@@ -246,18 +217,99 @@
                          dat$FR3.IMGT.c.a +
                          dat$FR3.IMGT.g.c +
                          dat$FR3.IMGT.c.g +
-                         dat$FR3.IMGT.g.t)
+                         dat$FR3.IMGT.g.t
 
-result[4,"x"] = transitionMutationsAtGC
-result[4,"y"] = totalMutationsAtGC
-result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1)
-
-result[5,"x"] = totalMutationsAtGC
-result[5,"y"] = VRegionMutations
-result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1)
 
 
-#transitiontable
+setwd(outputdir)
+
+matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=5)
+for(i in 1:length(genes)){
+  gene = genes[i]
+  tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),]
+  if(gene == "."){
+    tmp = dat
+  }
+  if(length(tmp) == 0){
+		cat("0", file=paste(gene, "_value.txt" ,sep=""))
+    next
+  }
+  j = i - 1
+  x = (j * 3) + 1
+  y = (j * 3) + 2
+  z = (j * 3) + 3
+  matrx[1,x] = sum(tmp$VRegionMutations)
+  matrx[1,y] = sum(tmp$VRegionNucleotides)
+  matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
+  matrx[2,x] = sum(tmp$transitionMutations)
+  matrx[2,y] = sum(tmp$VRegionMutations)
+  matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
+  matrx[3,x] = sum(tmp$transversionMutations)
+  matrx[3,y] = sum(tmp$VRegionMutations)
+  matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
+  matrx[4,x] = sum(tmp$transitionMutationsAtGC)
+  matrx[4,y] = sum(tmp$totalMutationsAtGC)
+  matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
+  matrx[5,x] = sum(tmp$totalMutationsAtGC)
+  matrx[5,y] = sum(tmp$VRegionMutations)
+  matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
+  
+  transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
+  row.names(transitionTable) = c("A", "C", "G", "T")
+  transitionTable["A","A"] = NA
+  transitionTable["C","C"] = NA
+  transitionTable["G","G"] = NA
+  transitionTable["T","T"] = NA
+  nts = c("a", "c", "g", "t")
+  
+  
+  for(nt1 in nts){
+    for(nt2 in nts){
+      if(nt1 == nt2){
+        next
+      }
+      NT1 = LETTERS[letters == nt1]
+      NT2 = LETTERS[letters == nt2]
+      FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
+      CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
+      FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
+      CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
+      FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
+      transitionTable[NT1,NT2] = sum( tmp[,FR1] + 
+                                        tmp[,CDR1] +
+                                        tmp[,FR2] +
+                                        tmp[,CDR2] +
+                                        tmp[,FR3])
+    }
+  }
+  write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
+  write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+  
+  cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep=""))
+  cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep=""))
+}
+
+#again for all of the data
+tmp = dat
+j = i
+x = (j * 3) + 1
+y = (j * 3) + 2
+z = (j * 3) + 3
+matrx[1,x] = sum(tmp$VRegionMutations)
+matrx[1,y] = sum(tmp$VRegionNucleotides)
+matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
+matrx[2,x] = sum(tmp$transitionMutations)
+matrx[2,y] = sum(tmp$VRegionMutations)
+matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
+matrx[3,x] = sum(tmp$transversionMutations)
+matrx[3,y] = sum(tmp$VRegionMutations)
+matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
+matrx[4,x] = sum(tmp$transitionMutationsAtGC)
+matrx[4,y] = sum(tmp$totalMutationsAtGC)
+matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
+matrx[5,x] = sum(tmp$totalMutationsAtGC)
+matrx[5,y] = sum(tmp$VRegionMutations)
+matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
 
 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
 row.names(transitionTable) = c("A", "C", "G", "T")
@@ -269,40 +321,88 @@
 
 
 for(nt1 in nts){
-  for(nt2 in nts){
-    if(nt1 == nt2){
-      next
-    }
-    NT1 = LETTERS[letters == nt1]
-    NT2 = LETTERS[letters == nt2]
-    FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
-    CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
-    FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
-    CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
-    FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
-    transitionTable[NT1,NT2] = sum( dat[,FR1] + 
-                                    dat[,CDR1] +
-                                    dat[,FR2] +
-                                    dat[,CDR2] +
-                                    dat[,FR3])
-  }
+	for(nt2 in nts){
+		if(nt1 == nt2){
+			next
+		}
+		NT1 = LETTERS[letters == nt1]
+		NT2 = LETTERS[letters == nt2]
+		FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
+		CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
+		FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
+		CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
+		FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
+		transitionTable[NT1,NT2] = sum( tmp[,FR1] + 
+																			tmp[,CDR1] +
+																			tmp[,FR2] +
+																			tmp[,CDR2] +
+																			tmp[,FR3])
+	}
+}
+write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
+write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T)
+cat(matrx[1,x], file="total_value.txt")
+cat(length(tmp$Sequence.ID), file="total_n.txt")
+
+
+
+result = data.frame(matrx)
+row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)")
+
+write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+
+
+if (!("ggplot2" %in% rownames(installed.packages()))) {
+	install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
+}
+library(ggplot2)
+
+genesForPlot = gsub("[0-9]", "", dat$best_match)
+genesForPlot = data.frame(table(genesForPlot))
+colnames(genesForPlot) = c("Gene","Freq")
+genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
+
+pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
+pc = pc + geom_bar(width = 1, stat = "identity")
+pc = pc + coord_polar(theta="y")
+pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")"))
+
+png(filename="all.png")
+pc
+dev.off()
+
+
+#blegh
+genesForPlot = dat[grepl("ca", dat$best_match),]$best_match
+if(length(genesForPlot) > 0){
+	genesForPlot = data.frame(table(genesForPlot))
+	colnames(genesForPlot) = c("Gene","Freq")
+	genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
+
+	pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
+	pc = pc + geom_bar(width = 1, stat = "identity")
+	pc = pc + coord_polar(theta="y")
+	pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")"))
+
+
+	png(filename="ca.png")
+	print(pc)
+	dev.off()
 }
 
-setwd(outputdir)
-write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
-write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
-cat(length(dat$Sequence.ID), file="n.txt")
+genesForPlot = dat[grepl("cg", dat$best_match),]$best_match
+if(length(genesForPlot) > 0){
+	genesForPlot = data.frame(table(genesForPlot))
+	colnames(genesForPlot) = c("Gene","Freq")
+	genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
+
+	pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
+	pc = pc + geom_bar(width = 1, stat = "identity")
+	pc = pc + coord_polar(theta="y")
+	pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG", "( n =", sum(genesForPlot$Freq), ")"))
 
 
-weblogo = dat[,c("Sequence.ID", "VGene")]
-weblogo$VGene = gsub("\\*.*", "", weblogo$VGene)
-
-rs12 = read.table("HS12RSS.txt", sep="\t", header=TRUE)
-rs23 = read.table("HS23RSS.txt", sep="\t", header=TRUE)
-
-result12 = merge(weblogo, rs12, by.x="VGene", by.y="Gene")
-result23 = merge(weblogo, rs23, by.x="VGene", by.y="Gene")
-
-write.table(x=result12$Sequence, file="weblogo_in_rs12.txt", sep=",",quote=F,row.names=F,col.names=F)
-write.table(x=result23$Sequence, file="weblogo_in_rs23.txt", sep=",",quote=F,row.names=F,col.names=F)
-
+	png(filename="cg.png")
+	print(pc)
+	dev.off()
+}