diff RScript.r @ 35:bd1116ba4ee1 draft

Uploaded
author davidvanzessen
date Fri, 08 Nov 2013 05:46:21 -0500
parents 3ec932d17ef3
children b3c97b26db60
line wrap: on
line diff
--- a/RScript.r	Fri Nov 08 05:44:12 2013 -0500
+++ b/RScript.r	Fri Nov 08 05:46:21 2013 -0500
@@ -7,24 +7,18 @@
 outDir = args[3]
 
 if (!require("gridExtra")) {
-	install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") 
+install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") 
 }
 library(gridExtra)
 if (!require("ggplot2")) {
-	install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
+install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
 }
 require(ggplot2)
 if (!require("plyr")) {
-	install.packages("plyr", repos="http://cran.xl-mirror.nl/") 
+install.packages("plyr", repos="http://cran.xl-mirror.nl/") 
 }			
 require(plyr)
 
-if (!("data.table" %in% rownames(installed.packages()))) {
-	install.packages("data.table", repos="http://cran.xl-mirror.nl/") 
-}
-library(data.table)
-
-
 test = read.table(inFile, sep="\t", header=TRUE)
 
 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene)
@@ -37,28 +31,35 @@
 
 NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ]
 
-#PRODF = PROD[ -1]
-
-PRODF = PROD
+PRODF = PROD[ -1]
 
 #PRODF = unique(PRODF)
 PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ]
 
-PRODFV = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.V.Gene")])
+
+uniqueCount = split(PRODF, f=PRODF[,"Sample"])
+
+for(i in 1:length(uniqueCount)) {
+    dat = data.frame(uniqueCount[i])
+    sample = paste(unique(dat[,15]))
+    uniqueCount[sample] = length(dat[,1])
+}
+
+PRODFV = ddply(PRODF, c("Sample", "Top.V.Gene"), function(x) summary(x$VDJCDR3))
 PRODFV$Length = as.numeric(PRODFV$Length)
 Total = 0
 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length)))
 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total))
 
-PRODFD = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.D.Gene")])
+PRODFD = ddply(PRODF, c("Sample", "Top.D.Gene"), function(x) summary(x$VDJCDR3))
 PRODFD$Length = as.numeric(PRODFD$Length)
 Total = 0
 Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length)))
 PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
 PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total))
 
-PRODFJ = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.J.Gene")])
+PRODFJ = ddply(PRODF, c("Sample", "Top.J.Gene"), function(x) summary(x$VDJCDR3))
 PRODFJ$Length = as.numeric(PRODFJ$Length)
 Total = 0
 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length)))
@@ -115,8 +116,8 @@
 	img = ggplot() + 
 	geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
-	scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0, maxVD)) + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
+	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
 	xlab("D genes") + 
 	ylab("V Genes")
 	
@@ -125,14 +126,13 @@
 	dev.off()
 }
 
-VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")])
 
+VandDCount = ddply(PRODF, c("Top.V.Gene", "Top.D.Gene", "Sample"), function(x) summary(x$VDJCDR3))
 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample))
 
 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
 completeVD$Length = as.numeric(completeVD$Length)
 completeVD$log = log(completeVD$Length)
-maxVD = max(subset(completeVD, !is.na(completeVD$log), "log"))
 completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
 #completeVD$log[is.na(completeVD$log)] = 0
@@ -147,7 +147,7 @@
 	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
 	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
 	xlab("J genes") + 
 	ylab("V Genes")
 	
@@ -156,8 +156,7 @@
 	dev.off()
 }
 
-VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")])
-#VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
 
 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
@@ -174,7 +173,7 @@
 	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
 	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 
 	xlab("J genes") + 
 	ylab("D Genes")
 	
@@ -183,8 +182,7 @@
 	dev.off()
 }
 
-DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")])
-#DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
 
 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)