comparison RScript.r @ 36:b3c97b26db60 draft

Uploaded
author davidvanzessen
date Fri, 08 Nov 2013 06:51:37 -0500
parents bd1116ba4ee1
children 16fe6233f90e
comparison
equal deleted inserted replaced
35:bd1116ba4ee1 36:b3c97b26db60
4 4
5 inFile = args[1] 5 inFile = args[1]
6 outFile = args[2] 6 outFile = args[2]
7 outDir = args[3] 7 outDir = args[3]
8 8
9 if (!require("gridExtra")) { 9 if (!("gridExtra" %in% rownames(installed.packages()))) {
10 install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") 10 install.packages("gridExtra", repos="http://cran.xl-mirror.nl/")
11 } 11 }
12 library(gridExtra) 12 library(gridExtra)
13 if (!require("ggplot2")) { 13 if (!("ggplot2" %in% rownames(installed.packages()))) {
14 install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 14 install.packages("ggplot2", repos="http://cran.xl-mirror.nl/")
15 } 15 }
16 require(ggplot2) 16 require(ggplot2)
17 if (!require("plyr")) { 17 if (!("plyr" %in% rownames(installed.packages()))) {
18 install.packages("plyr", repos="http://cran.xl-mirror.nl/") 18 install.packages("plyr", repos="http://cran.xl-mirror.nl/")
19 } 19 }
20 require(plyr) 20 require(plyr)
21
22 if (!("data.table" %in% rownames(installed.packages()))) {
23 install.packages("data.table", repos="http://cran.xl-mirror.nl/")
24 }
25 library(data.table)
26
21 27
22 test = read.table(inFile, sep="\t", header=TRUE) 28 test = read.table(inFile, sep="\t", header=TRUE)
23 29
24 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene) 30 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene)
25 test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene) 31 test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene)
29 35
30 PROD = test[test$VDJ.Frame != "In-frame with stop codon" & test$VDJ.Frame != "Out-of-frame" & test$CDR3.Found.How != "NOT_FOUND" , ] 36 PROD = test[test$VDJ.Frame != "In-frame with stop codon" & test$VDJ.Frame != "Out-of-frame" & test$CDR3.Found.How != "NOT_FOUND" , ]
31 37
32 NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ] 38 NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ]
33 39
34 PRODF = PROD[ -1] 40 #PRODF = PROD[ -1]
41
42 PRODF = PROD
35 43
36 #PRODF = unique(PRODF) 44 #PRODF = unique(PRODF)
37 PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ] 45 PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ]
38 46
39 47 PRODFV = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.V.Gene")])
40 uniqueCount = split(PRODF, f=PRODF[,"Sample"])
41
42 for(i in 1:length(uniqueCount)) {
43 dat = data.frame(uniqueCount[i])
44 sample = paste(unique(dat[,15]))
45 uniqueCount[sample] = length(dat[,1])
46 }
47
48 PRODFV = ddply(PRODF, c("Sample", "Top.V.Gene"), function(x) summary(x$VDJCDR3))
49 PRODFV$Length = as.numeric(PRODFV$Length) 48 PRODFV$Length = as.numeric(PRODFV$Length)
50 Total = 0 49 Total = 0
51 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) 50 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length)))
52 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) 51 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
53 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) 52 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total))
54 53
55 PRODFD = ddply(PRODF, c("Sample", "Top.D.Gene"), function(x) summary(x$VDJCDR3)) 54 PRODFD = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.D.Gene")])
56 PRODFD$Length = as.numeric(PRODFD$Length) 55 PRODFD$Length = as.numeric(PRODFD$Length)
57 Total = 0 56 Total = 0
58 Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length))) 57 Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length)))
59 PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE) 58 PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
60 PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total)) 59 PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total))
61 60
62 PRODFJ = ddply(PRODF, c("Sample", "Top.J.Gene"), function(x) summary(x$VDJCDR3)) 61 PRODFJ = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.J.Gene")])
63 PRODFJ$Length = as.numeric(PRODFJ$Length) 62 PRODFJ$Length = as.numeric(PRODFJ$Length)
64 Total = 0 63 Total = 0
65 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) 64 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length)))
66 PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE) 65 PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
67 PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total)) 66 PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total))
114 113
115 plotVD <- function(dat){ 114 plotVD <- function(dat){
116 img = ggplot() + 115 img = ggplot() +
117 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 116 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) +
118 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 117 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
119 scale_fill_gradient(low="gold", high="blue", na.value="white") + 118 scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0, maxVD)) +
120 ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 119 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) +
121 xlab("D genes") + 120 xlab("D genes") +
122 ylab("V Genes") 121 ylab("V Genes")
123 122
124 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) 123 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name)))
125 print(img) 124 print(img)
126 dev.off() 125 dev.off()
127 } 126 }
128 127
129 128 VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")])
130 VandDCount = ddply(PRODF, c("Top.V.Gene", "Top.D.Gene", "Sample"), function(x) summary(x$VDJCDR3)) 129
130
131
132
131 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample)) 133 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample))
132 134
133 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) 135 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
134 completeVD$Length = as.numeric(completeVD$Length) 136 completeVD$Length = as.numeric(completeVD$Length)
135 completeVD$log = log(completeVD$Length) 137 completeVD$log = log(completeVD$Length)
138 maxVD = max(subset(completeVD, !is.na(completeVD$log), "log"))
136 completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) 139 completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
137 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) 140 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
138 #completeVD$log[is.na(completeVD$log)] = 0 141 #completeVD$log[is.na(completeVD$log)] = 0
139 l = split(completeVD, f=completeVD[,"Sample"]) 142 l = split(completeVD, f=completeVD[,"Sample"])
140 143
145 plotVJ <- function(dat){ 148 plotVJ <- function(dat){
146 img = ggplot() + 149 img = ggplot() +
147 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 150 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) +
148 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 151 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
149 scale_fill_gradient(low="gold", high="blue", na.value="white") + 152 scale_fill_gradient(low="gold", high="blue", na.value="white") +
150 ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 153 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) +
151 xlab("J genes") + 154 xlab("J genes") +
152 ylab("V Genes") 155 ylab("V Genes")
153 156
154 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) 157 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name)))
155 print(img) 158 print(img)
156 dev.off() 159 dev.off()
157 } 160 }
158 161
159 VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3)) 162 VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")])
163 #VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
160 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) 164 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
161 165
162 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) 166 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
163 completeVJ$Length = as.numeric(completeVJ$Length) 167 completeVJ$Length = as.numeric(completeVJ$Length)
164 completeVJ$log = log(completeVJ$Length) 168 completeVJ$log = log(completeVJ$Length)
171 plotDJ <- function(dat){ 175 plotDJ <- function(dat){
172 img = ggplot() + 176 img = ggplot() +
173 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) + 177 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) +
174 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 178 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
175 scale_fill_gradient(low="gold", high="blue", na.value="white") + 179 scale_fill_gradient(low="gold", high="blue", na.value="white") +
176 ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) + 180 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) +
177 xlab("J genes") + 181 xlab("J genes") +
178 ylab("D Genes") 182 ylab("D Genes")
179 183
180 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) 184 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name)))
181 print(img) 185 print(img)
182 dev.off() 186 dev.off()
183 } 187 }
184 188
185 DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3)) 189 DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")])
190 #DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
186 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) 191 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
187 192
188 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) 193 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
189 completeDJ$Length = as.numeric(completeDJ$Length) 194 completeDJ$Length = as.numeric(completeDJ$Length)
190 completeDJ$log = log(completeDJ$Length) 195 completeDJ$log = log(completeDJ$Length)