Mercurial > repos > davidvanzessen > plotting_merged
diff RScript.r @ 36:b3c97b26db60 draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 08 Nov 2013 06:51:37 -0500 |
parents | bd1116ba4ee1 |
children | 16fe6233f90e |
line wrap: on
line diff
--- a/RScript.r Fri Nov 08 05:46:21 2013 -0500 +++ b/RScript.r Fri Nov 08 06:51:37 2013 -0500 @@ -6,19 +6,25 @@ outFile = args[2] outDir = args[3] -if (!require("gridExtra")) { -install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") +if (!("gridExtra" %in% rownames(installed.packages()))) { + install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") } library(gridExtra) -if (!require("ggplot2")) { -install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") } require(ggplot2) -if (!require("plyr")) { -install.packages("plyr", repos="http://cran.xl-mirror.nl/") +if (!("plyr" %in% rownames(installed.packages()))) { + install.packages("plyr", repos="http://cran.xl-mirror.nl/") } require(plyr) +if (!("data.table" %in% rownames(installed.packages()))) { + install.packages("data.table", repos="http://cran.xl-mirror.nl/") +} +library(data.table) + + test = read.table(inFile, sep="\t", header=TRUE) test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene) @@ -31,35 +37,28 @@ NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ] -PRODF = PROD[ -1] +#PRODF = PROD[ -1] + +PRODF = PROD #PRODF = unique(PRODF) PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ] - -uniqueCount = split(PRODF, f=PRODF[,"Sample"]) - -for(i in 1:length(uniqueCount)) { - dat = data.frame(uniqueCount[i]) - sample = paste(unique(dat[,15])) - uniqueCount[sample] = length(dat[,1]) -} - -PRODFV = ddply(PRODF, c("Sample", "Top.V.Gene"), function(x) summary(x$VDJCDR3)) +PRODFV = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.V.Gene")]) PRODFV$Length = as.numeric(PRODFV$Length) Total = 0 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) -PRODFD = ddply(PRODF, c("Sample", "Top.D.Gene"), function(x) summary(x$VDJCDR3)) +PRODFD = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.D.Gene")]) PRODFD$Length = as.numeric(PRODFD$Length) Total = 0 Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length))) PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE) PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total)) -PRODFJ = ddply(PRODF, c("Sample", "Top.J.Gene"), function(x) summary(x$VDJCDR3)) +PRODFJ = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.J.Gene")]) PRODFJ$Length = as.numeric(PRODFJ$Length) Total = 0 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) @@ -116,8 +115,8 @@ img = ggplot() + geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - scale_fill_gradient(low="gold", high="blue", na.value="white") + - ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + + scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0, maxVD)) + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + xlab("D genes") + ylab("V Genes") @@ -126,13 +125,17 @@ dev.off() } +VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) -VandDCount = ddply(PRODF, c("Top.V.Gene", "Top.D.Gene", "Sample"), function(x) summary(x$VDJCDR3)) + + + cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample)) completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) completeVD$Length = as.numeric(completeVD$Length) completeVD$log = log(completeVD$Length) +maxVD = max(subset(completeVD, !is.na(completeVD$log), "log")) completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) #completeVD$log[is.na(completeVD$log)] = 0 @@ -147,7 +150,7 @@ geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_fill_gradient(low="gold", high="blue", na.value="white") + - ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + xlab("J genes") + ylab("V Genes") @@ -156,7 +159,8 @@ dev.off() } -VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3)) +VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) +#VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3)) cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) @@ -173,7 +177,7 @@ geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_fill_gradient(low="gold", high="blue", na.value="white") + - ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + xlab("J genes") + ylab("D Genes") @@ -182,7 +186,8 @@ dev.off() } -DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3)) +DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")]) +#DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3)) cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)