comparison RScript.r @ 38:6e490e056fc4 draft

Uploaded
author davidvanzessen
date Thu, 14 Nov 2013 06:57:35 -0500
parents 16fe6233f90e
children a9ad03d52680
comparison
equal deleted inserted replaced
37:16fe6233f90e 38:6e490e056fc4
23 install.packages("data.table", repos="http://cran.xl-mirror.nl/") 23 install.packages("data.table", repos="http://cran.xl-mirror.nl/")
24 } 24 }
25 library(data.table) 25 library(data.table)
26 26
27 27
28 test = read.table(inFile, sep="\t", header=TRUE) 28 test = read.table(inFile, sep="\t", header=TRUE, fill=T)
29 29
30 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene) 30 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene)
31 test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene) 31 test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene)
32 test$Top.J.Gene = gsub("[*]([0-9]+)", "", test$Top.J.Gene) 32 test$Top.J.Gene = gsub("[*]([0-9]+)", "", test$Top.J.Gene)
33 33
108 108
109 png("JPlot.png",width = 800, height = 600) 109 png("JPlot.png",width = 800, height = 600)
110 pJ 110 pJ
111 dev.off(); 111 dev.off();
112 112
113 revVchain = Vchain
114 revDchain = Dchain
115 revVchain$chr.orderV = rev(revVchain$chr.orderV)
116 revDchain$chr.orderD = rev(revDchain$chr.orderD)
113 117
114 plotVD <- function(dat){ 118 plotVD <- function(dat){
115 img = ggplot() + 119 img = ggplot() +
116 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 120 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) +
117 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 121 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
118 scale_fill_gradient(low="gold", high="blue", na.value="white") + 122 scale_fill_gradient(low="gold", high="blue", na.value="white") +
119 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 123 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) +
120 xlab("D genes") + 124 xlab("D genes") +
121 ylab("V Genes") 125 ylab("V Genes")
122 126
123 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) 127 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name)))
124 print(img) 128 print(img)
125 dev.off() 129 dev.off()
126 } 130 }
127 131
128 VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) 132 VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")])
133
134 VandDCount$l = log(VandDCount$Length)
135 maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")])
136 VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T)
137 VandDCount$relLength = VandDCount$l / VandDCount$max
138
129 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample)) 139 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample))
130 140
131 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) 141 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
132 completeVD$Length = as.numeric(completeVD$Length) 142 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
133 completeVD$log = log(completeVD$Length)
134 completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
135 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) 143 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
136 #completeVD$log[is.na(completeVD$log)] = 0 144 VDList = split(completeVD, f=completeVD[,"Sample"])
137 l = split(completeVD, f=completeVD[,"Sample"]) 145
138 146 lapply(VDList, FUN=plotVD)
139 lapply(l, FUN=plotVD)
140 147
141 148
142 149
143 plotVJ <- function(dat){ 150 plotVJ <- function(dat){
144 img = ggplot() + 151 img = ggplot() +
145 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) + 152 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) +
146 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 153 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
147 scale_fill_gradient(low="gold", high="blue", na.value="white") + 154 scale_fill_gradient(low="gold", high="blue", na.value="white") +
148 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 155 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) +
149 xlab("J genes") + 156 xlab("J genes") +
150 ylab("V Genes") 157 ylab("V Genes")
151 158
152 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) 159 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name)))
153 print(img) 160 print(img)
154 dev.off() 161 dev.off()
155 } 162 }
156 163
157 VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) 164 VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")])
158 #VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3)) 165
166 VandJCount$l = log(VandJCount$Length)
167 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")])
168 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T)
169 VandJCount$relLength = VandJCount$l / VandJCount$max
170
159 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) 171 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
160 172
161 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) 173 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
162 completeVJ$Length = as.numeric(completeVJ$Length) 174 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
163 completeVJ$log = log(completeVJ$Length)
164 completeVJ = merge(completeVJ, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
165 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) 175 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
166 #completeVJ$log[is.na(completeVJ$log)] = 0 176 VJList = split(completeVJ, f=completeVJ[,"Sample"])
167 l = split(completeVJ, f=completeVJ[,"Sample"]) 177 lapply(VJList, FUN=plotVJ)
168 lapply(l, FUN=plotVJ)
169 178
170 plotDJ <- function(dat){ 179 plotDJ <- function(dat){
171 img = ggplot() + 180 img = ggplot() +
172 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) + 181 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=relLength)) +
173 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 182 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
174 scale_fill_gradient(low="gold", high="blue", na.value="white") + 183 scale_fill_gradient(low="gold", high="blue", na.value="white") +
175 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat[!is.na(dat[,4]),][4]) ,")", sep="")) + 184 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) +
176 xlab("J genes") + 185 xlab("J genes") +
177 ylab("D Genes") 186 ylab("D Genes")
178 187
179 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) 188 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name)))
180 print(img) 189 print(img)
181 dev.off() 190 dev.off()
182 } 191 }
183 192
184 DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")]) 193 DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")])
185 #DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3)) 194
195 DandJCount$l = log(DandJCount$Length)
196 maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")])
197 DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T)
198 DandJCount$relLength = DandJCount$l / DandJCount$max
199
186 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) 200 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
187 201
188 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) 202 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
189 completeDJ$Length = as.numeric(completeDJ$Length) 203 completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
190 completeDJ$log = log(completeDJ$Length)
191 completeDJ = merge(completeDJ, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
192 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) 204 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
193 #completeDJ$log[is.na(completeDJ$log)] = 0 205 DJList = split(completeDJ, f=completeDJ[,"Sample"])
194 l = split(completeDJ, f=completeDJ[,"Sample"]) 206 lapply(DJList, FUN=plotDJ)
195 lapply(l, FUN=plotDJ)
196 207
197 208
198 sampleFile <- file("samples.txt") 209 sampleFile <- file("samples.txt")
199 un = unique(test$Sample) 210 un = unique(test$Sample)
200 un = paste(un, sep="\n") 211 un = paste(un, sep="\n")