comparison report_clonality/RScript.r @ 18:431797cd74c8 draft

Uploaded
author davidvanzessen
date Mon, 21 Dec 2015 07:23:37 -0500
parents ee1bda8c27c8
children 672d5e010b1f
comparison
equal deleted inserted replaced
17:ee1bda8c27c8 18:431797cd74c8
41 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD 41 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD
42 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) 42 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no)
43 clonality_method = args[8] 43 clonality_method = args[8]
44 44
45 # ---------------------- Data preperation ---------------------- 45 # ---------------------- Data preperation ----------------------
46
47 print("Report Clonality - Data preperation")
46 48
47 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") 49 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="")
48 50
49 setwd(outdir) 51 setwd(outdir)
50 52
112 writeLines(un, sampleFile) 114 writeLines(un, sampleFile)
113 close(sampleFile) 115 close(sampleFile)
114 116
115 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- 117 # ---------------------- Counting the productive/unproductive and unique sequences ----------------------
116 118
119 print("Report Clonality - counting productive/unproductive/unique")
120
117 if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data 121 if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data
118 inputdata$Functionality = "unproductive" 122 inputdata$Functionality = "unproductive"
119 search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND") 123 search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND")
120 if(sum(search) > 0){ 124 if(sum(search) > 0){
121 inputdata[search,]$Functionality = "productive" 125 inputdata[search,]$Functionality = "productive"
175 179
176 write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) 180 write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)
177 181
178 # ---------------------- Frequency calculation for V, D and J ---------------------- 182 # ---------------------- Frequency calculation for V, D and J ----------------------
179 183
184 print("Report Clonality - frequency calculation V, D and J")
185
180 PRODFV = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.V.Gene")]) 186 PRODFV = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.V.Gene")])
181 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) 187 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length)))
182 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) 188 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
183 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) 189 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total))
184 190
191 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) 197 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length)))
192 PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE) 198 PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
193 PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total)) 199 PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total))
194 200
195 # ---------------------- Setting up the gene names for the different species/loci ---------------------- 201 # ---------------------- Setting up the gene names for the different species/loci ----------------------
202
203 print("Report Clonality - getting genes for species/loci")
196 204
197 Vchain = "" 205 Vchain = ""
198 Dchain = "" 206 Dchain = ""
199 Jchain = "" 207 Jchain = ""
200 208
231 useD = TRUE 239 useD = TRUE
232 if(nrow(Dchain) == 0){ 240 if(nrow(Dchain) == 0){
233 useD = FALSE 241 useD = FALSE
234 cat("No D Genes in this species/locus") 242 cat("No D Genes in this species/locus")
235 } 243 }
236 print(paste("useD:", useD)) 244 print(paste(nrow(Vchain), "genes in V"))
245 print(paste(nrow(Dchain), "genes in D"))
246 print(paste(nrow(Jchain), "genes in J"))
237 247
238 # ---------------------- merge with the frequency count ---------------------- 248 # ---------------------- merge with the frequency count ----------------------
239 249
240 PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE) 250 PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE)
241 251
242 PRODFD = merge(PRODFD, Dchain, by.x='Top.D.Gene', by.y='v.name', all.x=TRUE) 252 PRODFD = merge(PRODFD, Dchain, by.x='Top.D.Gene', by.y='v.name', all.x=TRUE)
243 253
244 PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE) 254 PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE)
245 255
246 # ---------------------- Create the V, D and J frequency plots and write the data.frame for every plot to a file ---------------------- 256 # ---------------------- Create the V, D and J frequency plots and write the data.frame for every plot to a file ----------------------
257
258 print("Report Clonality - V, D and J frequency plots")
247 259
248 pV = ggplot(PRODFV) 260 pV = ggplot(PRODFV)
249 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 261 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
250 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") 262 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage")
251 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) 263 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
282 png("JPlot.png",width = 800, height = 600) 294 png("JPlot.png",width = 800, height = 600)
283 pJ 295 pJ
284 dev.off(); 296 dev.off();
285 297
286 # ---------------------- Now the frequency plots of the V, D and J families ---------------------- 298 # ---------------------- Now the frequency plots of the V, D and J families ----------------------
299
300 print("Report Clonality - V, D and J family plots")
287 301
288 VGenes = PRODF[,c("Sample", "Top.V.Gene")] 302 VGenes = PRODF[,c("Sample", "Top.V.Gene")]
289 VGenes$Top.V.Gene = gsub("-.*", "", VGenes$Top.V.Gene) 303 VGenes$Top.V.Gene = gsub("-.*", "", VGenes$Top.V.Gene)
290 VGenes = data.frame(data.table(VGenes)[, list(Count=.N), by=c("Sample", "Top.V.Gene")]) 304 VGenes = data.frame(data.table(VGenes)[, list(Count=.N), by=c("Sample", "Top.V.Gene")])
291 TotalPerSample = data.frame(data.table(VGenes)[, list(total=sum(.SD$Count)), by=Sample]) 305 TotalPerSample = data.frame(data.table(VGenes)[, list(total=sum(.SD$Count)), by=Sample])
332 dev.off(); 346 dev.off();
333 write.table(x=JGenes, file="JFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) 347 write.table(x=JGenes, file="JFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
334 348
335 # ---------------------- Plotting the cdr3 length ---------------------- 349 # ---------------------- Plotting the cdr3 length ----------------------
336 350
351 print("Report Clonality - CDR3 length plot")
352
337 CDR3Length = data.frame(data.table(PRODF)[, list(Count=.N), by=c("Sample", "CDR3.Length.DNA")]) 353 CDR3Length = data.frame(data.table(PRODF)[, list(Count=.N), by=c("Sample", "CDR3.Length.DNA")])
338 TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample]) 354 TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample])
339 CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample") 355 CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample")
340 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total 356 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total
341 CDR3LengthPlot = ggplot(CDR3Length) 357 CDR3LengthPlot = ggplot(CDR3Length)
348 dev.off() 364 dev.off()
349 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) 365 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T)
350 366
351 # ---------------------- Plot the heatmaps ---------------------- 367 # ---------------------- Plot the heatmaps ----------------------
352 368
353
354 #get the reverse order for the V and D genes 369 #get the reverse order for the V and D genes
355 revVchain = Vchain 370 revVchain = Vchain
356 revDchain = Dchain 371 revDchain = Dchain
357 revVchain$chr.orderV = rev(revVchain$chr.orderV) 372 revVchain$chr.orderV = rev(revVchain$chr.orderV)
358 revDchain$chr.orderD = rev(revDchain$chr.orderD) 373 revDchain$chr.orderD = rev(revDchain$chr.orderD)
359 374
360 if(useD){ 375 if(useD){
376 print("Report Clonality - Heatmaps VD")
361 plotVD <- function(dat){ 377 plotVD <- function(dat){
362 if(length(dat[,1]) == 0){ 378 if(length(dat[,1]) == 0){
363 return() 379 return()
364 } 380 }
365 img = ggplot() + 381 img = ggplot() +
387 403
388 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) 404 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
389 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) 405 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
390 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) 406 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
391 VDList = split(completeVD, f=completeVD[,"Sample"]) 407 VDList = split(completeVD, f=completeVD[,"Sample"])
392
393 lapply(VDList, FUN=plotVD) 408 lapply(VDList, FUN=plotVD)
394 } 409 }
410
411 print("Report Clonality - Heatmaps VJ")
395 412
396 plotVJ <- function(dat){ 413 plotVJ <- function(dat){
397 if(length(dat[,1]) == 0){ 414 if(length(dat[,1]) == 0){
398 return() 415 return()
399 } 416 }
425 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) 442 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
426 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) 443 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
427 VJList = split(completeVJ, f=completeVJ[,"Sample"]) 444 VJList = split(completeVJ, f=completeVJ[,"Sample"])
428 lapply(VJList, FUN=plotVJ) 445 lapply(VJList, FUN=plotVJ)
429 446
447
448
430 if(useD){ 449 if(useD){
450 print("Report Clonality - Heatmaps DJ")
431 plotDJ <- function(dat){ 451 plotDJ <- function(dat){
432 if(length(dat[,1]) == 0){ 452 if(length(dat[,1]) == 0){
433 return() 453 return()
434 } 454 }
435 img = ggplot() + 455 img = ggplot() +