changeset 32:548b3035bbd7 draft

Uploaded
author davidvanzessen
date Fri, 08 Nov 2013 05:29:29 -0500
parents d26e8c208b5f
children 8d9be90792b5
files RScript.r
diffstat 1 files changed, 121 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/RScript.r	Fri Nov 08 05:29:20 2013 -0500
+++ b/RScript.r	Fri Nov 08 05:29:29 2013 -0500
@@ -4,12 +4,19 @@
 
 inFile = args[1]
 outFile = args[2]
+outDir = args[3]
 
-#install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") 
-library (gridExtra)
-#install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
+if (!require("gridExtra")) {
+install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") 
+}
+library(gridExtra)
+if (!require("ggplot2")) {
+install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
+}
 require(ggplot2)
-#install.packages("plyr", repos="http://cran.xl-mirror.nl/") 		
+if (!require("plyr")) {
+install.packages("plyr", repos="http://cran.xl-mirror.nl/") 
+}			
 require(plyr)
 
 test = read.table(inFile, sep="\t", header=TRUE)
@@ -26,12 +33,17 @@
 
 PRODF = PROD[ -1]
 
-#unique(PRODF[duplicated(PRODF),])
-#length(row.names(PRODF[duplicated(PRODF),]))
+#PRODF = unique(PRODF)
+PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ]
+
+
+uniqueCount = split(PRODF, f=PRODF[,"Sample"])
 
-#length(row.names(PRODF))
-PRODF = unique(PRODF)
-#length(row.names(PRODF))
+for(i in 1:length(uniqueCount)) {
+    dat = data.frame(uniqueCount[i])
+    sample = paste(unique(dat[,15]))
+    uniqueCount[sample] = length(dat[,1])
+}
 
 PRODFV = ddply(PRODF, c("Sample", "Top.V.Gene"), function(x) summary(x$VDJCDR3))
 PRODFV$Length = as.numeric(PRODFV$Length)
@@ -73,28 +85,118 @@
 PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE)
 close(tcJ)
 
+setwd(outDir)
+
 pV = ggplot(PRODFV)
 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage")
-#pV
+
+png("VPlot.png",width = 1280, height = 720)
+pV
+dev.off();
 
 pD = ggplot(PRODFD)
 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage")
-#pD
+
+png("DPlot.png",width = 800, height = 600)
+pD
+dev.off();
 
 pJ = ggplot(PRODFJ)
 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
-pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage")
-#pJ
+pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage")
+
+png("JPlot.png",width = 800, height = 600)
+pJ
+dev.off();
+
 
-#plotall = grid.arrange(pV, arrangeGrob(pD, pJ, ncol=2), ncol=1, widths=c(1,1.2))
-#plotall
-#ggsave(outFile, dpi=125)
+plotVD <- function(dat){
+	img = ggplot() + 
+	geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 
+	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
+	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
+	xlab("D genes") + 
+	ylab("V Genes")
+	
+	png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name)))
+	print(img)
+	dev.off()
+}
+
+
+VandDCount = ddply(PRODF, c("Top.V.Gene", "Top.D.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample))
+
+completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
+completeVD$Length = as.numeric(completeVD$Length)
+completeVD$log = log(completeVD$Length)
+completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
+completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
+#completeVD$log[is.na(completeVD$log)] = 0
+l = split(completeVD, f=completeVD[,"Sample"])
+
+lapply(l, FUN=plotVD)
+
 
 
-png(outFile,width = 1920, height = 1200)
-plotall = grid.arrange(pV, arrangeGrob(pD, pJ, ncol=2), ncol=1, widths=c(1,1.2))
-dev.off()
+plotVJ <- function(dat){
+	img = ggplot() + 
+	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 
+	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
+	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
+	xlab("J genes") + 
+	ylab("V Genes")
+	
+	png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name)))
+	print(img)
+	dev.off()
+}
+
+VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
+
+completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
+completeVJ$Length = as.numeric(completeVJ$Length)
+completeVJ$log = log(completeVJ$Length)
+completeVJ = merge(completeVJ, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
+completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
+#completeVJ$log[is.na(completeVJ$log)] = 0
+l = split(completeVJ, f=completeVJ[,"Sample"])
+lapply(l, FUN=plotVJ)
+
+plotDJ <- function(dat){
+	img = ggplot() + 
+	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) + 
+	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
+	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
+	xlab("J genes") + 
+	ylab("D Genes")
+	
+	png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name)))
+	print(img)
+	dev.off()
+}
+
+DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
+
+completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
+completeDJ$Length = as.numeric(completeDJ$Length)
+completeDJ$log = log(completeDJ$Length)
+completeDJ = merge(completeDJ, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
+completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
+#completeDJ$log[is.na(completeDJ$log)] = 0
+l = split(completeDJ, f=completeDJ[,"Sample"])
+lapply(l, FUN=plotDJ)
 
 
+sampleFile <- file("samples.txt")
+un = unique(test$Sample)
+un = paste(un, sep="\n")
+writeLines(un, sampleFile)
+close(sampleFile)