changeset 38:6e490e056fc4 draft

Uploaded
author davidvanzessen
date Thu, 14 Nov 2013 06:57:35 -0500
parents 16fe6233f90e
children a9ad03d52680
files RScript.r
diffstat 1 files changed, 38 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/RScript.r	Fri Nov 08 06:54:21 2013 -0500
+++ b/RScript.r	Thu Nov 14 06:57:35 2013 -0500
@@ -25,7 +25,7 @@
 library(data.table)
 
 
-test = read.table(inFile, sep="\t", header=TRUE)
+test = read.table(inFile, sep="\t", header=TRUE, fill=T)
 
 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene)
 test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene)
@@ -110,13 +110,17 @@
 pJ
 dev.off();
 
+revVchain = Vchain
+revDchain = Dchain
+revVchain$chr.orderV = rev(revVchain$chr.orderV)
+revDchain$chr.orderD = rev(revDchain$chr.orderD)
 
 plotVD <- function(dat){
 	img = ggplot() + 
-	geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 
+	geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
 	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 
 	xlab("D genes") + 
 	ylab("V Genes")
 	
@@ -126,26 +130,29 @@
 }
 
 VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")])
+
+VandDCount$l = log(VandDCount$Length)
+maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")])
+VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T)
+VandDCount$relLength = VandDCount$l / VandDCount$max
+
 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample))
 
 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
-completeVD$Length = as.numeric(completeVD$Length)
-completeVD$log = log(completeVD$Length)
-completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
+completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
-#completeVD$log[is.na(completeVD$log)] = 0
-l = split(completeVD, f=completeVD[,"Sample"])
+VDList = split(completeVD, f=completeVD[,"Sample"])
 
-lapply(l, FUN=plotVD)
+lapply(VDList, FUN=plotVD)
 
 
 
 plotVJ <- function(dat){
 	img = ggplot() + 
-	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 
+	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
 	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 
 	xlab("J genes") + 
 	ylab("V Genes")
 	
@@ -155,24 +162,26 @@
 }
 
 VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")])
-#VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+
+VandJCount$l = log(VandJCount$Length)
+maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")])
+VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T)
+VandJCount$relLength = VandJCount$l / VandJCount$max
+
 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
 
 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
-completeVJ$Length = as.numeric(completeVJ$Length)
-completeVJ$log = log(completeVJ$Length)
-completeVJ = merge(completeVJ, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
+completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
-#completeVJ$log[is.na(completeVJ$log)] = 0
-l = split(completeVJ, f=completeVJ[,"Sample"])
-lapply(l, FUN=plotVJ)
+VJList = split(completeVJ, f=completeVJ[,"Sample"])
+lapply(VJList, FUN=plotVJ)
 
 plotDJ <- function(dat){
 	img = ggplot() + 
-	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) + 
+	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=relLength)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
 	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
-	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 
+	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 
 	xlab("J genes") + 
 	ylab("D Genes")
 	
@@ -182,17 +191,19 @@
 }
 
 DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")])
-#DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
+
+DandJCount$l = log(DandJCount$Length)
+maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")])
+DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T)
+DandJCount$relLength = DandJCount$l / DandJCount$max
+
 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
 
 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
-completeDJ$Length = as.numeric(completeDJ$Length)
-completeDJ$log = log(completeDJ$Length)
-completeDJ = merge(completeDJ, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
+completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
-#completeDJ$log[is.na(completeDJ$log)] = 0
-l = split(completeDJ, f=completeDJ[,"Sample"])
-lapply(l, FUN=plotDJ)
+DJList = split(completeDJ, f=completeDJ[,"Sample"])
+lapply(DJList, FUN=plotDJ)
 
 
 sampleFile <- file("samples.txt")