diff RScript.r @ 36:b3c97b26db60 draft

Uploaded
author davidvanzessen
date Fri, 08 Nov 2013 06:51:37 -0500
parents bd1116ba4ee1
children 16fe6233f90e
line wrap: on
line diff
--- a/RScript.r	Fri Nov 08 05:46:21 2013 -0500
+++ b/RScript.r	Fri Nov 08 06:51:37 2013 -0500
@@ -6,19 +6,25 @@
 outFile = args[2]
 outDir = args[3]
 
-if (!require("gridExtra")) {
-install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") 
+if (!("gridExtra" %in% rownames(installed.packages()))) {
+	install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") 
 }
 library(gridExtra)
-if (!require("ggplot2")) {
-install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
+if (!("ggplot2" %in% rownames(installed.packages()))) {
+	install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
 }
 require(ggplot2)
-if (!require("plyr")) {
-install.packages("plyr", repos="http://cran.xl-mirror.nl/") 
+if (!("plyr" %in% rownames(installed.packages()))) {
+	install.packages("plyr", repos="http://cran.xl-mirror.nl/") 
 }			
 require(plyr)
 
+if (!("data.table" %in% rownames(installed.packages()))) {
+	install.packages("data.table", repos="http://cran.xl-mirror.nl/") 
+}
+library(data.table)
+
+
 test = read.table(inFile, sep="\t", header=TRUE)
 
 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene)
@@ -31,35 +37,28 @@
 
 NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ]
 
-PRODF = PROD[ -1]
+#PRODF = PROD[ -1]
+
+PRODF = PROD
 
 #PRODF = unique(PRODF)
 PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ]
 
-
-uniqueCount = split(PRODF, f=PRODF[,"Sample"])
-
-for(i in 1:length(uniqueCount)) {
-    dat = data.frame(uniqueCount[i])
-    sample = paste(unique(dat[,15]))
-    uniqueCount[sample] = length(dat[,1])
-}
-
-PRODFV = ddply(PRODF, c("Sample", "Top.V.Gene"), function(x) summary(x$VDJCDR3))
+PRODFV = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.V.Gene")])
 PRODFV$Length = as.numeric(PRODFV$Length)
 Total = 0
 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length)))
 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total))
 
-PRODFD = ddply(PRODF, c("Sample", "Top.D.Gene"), function(x) summary(x$VDJCDR3))
+PRODFD = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.D.Gene")])
 PRODFD$Length = as.numeric(PRODFD$Length)
 Total = 0
 Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length)))
 PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
 PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total))
 
-PRODFJ = ddply(PRODF, c("Sample", "Top.J.Gene"), function(x) summary(x$VDJCDR3))
+PRODFJ = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.J.Gene")])
 PRODFJ$Length = as.numeric(PRODFJ$Length)
 Total = 0
 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length)))
@@ -116,8 +115,8 @@
 	img = ggplot() + 
 	geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
-	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
+	scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0, maxVD)) + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
 	xlab("D genes") + 
 	ylab("V Genes")
 	
@@ -126,13 +125,17 @@
 	dev.off()
 }
 
+VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")])
 
-VandDCount = ddply(PRODF, c("Top.V.Gene", "Top.D.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+
+
+
 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample))
 
 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
 completeVD$Length = as.numeric(completeVD$Length)
 completeVD$log = log(completeVD$Length)
+maxVD = max(subset(completeVD, !is.na(completeVD$log), "log"))
 completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
 #completeVD$log[is.na(completeVD$log)] = 0
@@ -147,7 +150,7 @@
 	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
 	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
 	xlab("J genes") + 
 	ylab("V Genes")
 	
@@ -156,7 +159,8 @@
 	dev.off()
 }
 
-VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")])
+#VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
 
 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
@@ -173,7 +177,7 @@
 	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
 	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
 	xlab("J genes") + 
 	ylab("D Genes")
 	
@@ -182,7 +186,8 @@
 	dev.off()
 }
 
-DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")])
+#DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
 
 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)