Mercurial > repos > davidvanzessen > argalaxy_tools
comparison report_clonality/RScript.r @ 18:431797cd74c8 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 21 Dec 2015 07:23:37 -0500 |
| parents | ee1bda8c27c8 |
| children | 672d5e010b1f |
comparison
equal
deleted
inserted
replaced
| 17:ee1bda8c27c8 | 18:431797cd74c8 |
|---|---|
| 41 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD | 41 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD |
| 42 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) | 42 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) |
| 43 clonality_method = args[8] | 43 clonality_method = args[8] |
| 44 | 44 |
| 45 # ---------------------- Data preperation ---------------------- | 45 # ---------------------- Data preperation ---------------------- |
| 46 | |
| 47 print("Report Clonality - Data preperation") | |
| 46 | 48 |
| 47 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") | 49 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") |
| 48 | 50 |
| 49 setwd(outdir) | 51 setwd(outdir) |
| 50 | 52 |
| 112 writeLines(un, sampleFile) | 114 writeLines(un, sampleFile) |
| 113 close(sampleFile) | 115 close(sampleFile) |
| 114 | 116 |
| 115 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- | 117 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- |
| 116 | 118 |
| 119 print("Report Clonality - counting productive/unproductive/unique") | |
| 120 | |
| 117 if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data | 121 if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data |
| 118 inputdata$Functionality = "unproductive" | 122 inputdata$Functionality = "unproductive" |
| 119 search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND") | 123 search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND") |
| 120 if(sum(search) > 0){ | 124 if(sum(search) > 0){ |
| 121 inputdata[search,]$Functionality = "productive" | 125 inputdata[search,]$Functionality = "productive" |
| 175 | 179 |
| 176 write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) | 180 write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) |
| 177 | 181 |
| 178 # ---------------------- Frequency calculation for V, D and J ---------------------- | 182 # ---------------------- Frequency calculation for V, D and J ---------------------- |
| 179 | 183 |
| 184 print("Report Clonality - frequency calculation V, D and J") | |
| 185 | |
| 180 PRODFV = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.V.Gene")]) | 186 PRODFV = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.V.Gene")]) |
| 181 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) | 187 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) |
| 182 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) | 188 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) |
| 183 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) | 189 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) |
| 184 | 190 |
| 191 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) | 197 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) |
| 192 PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE) | 198 PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE) |
| 193 PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total)) | 199 PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total)) |
| 194 | 200 |
| 195 # ---------------------- Setting up the gene names for the different species/loci ---------------------- | 201 # ---------------------- Setting up the gene names for the different species/loci ---------------------- |
| 202 | |
| 203 print("Report Clonality - getting genes for species/loci") | |
| 196 | 204 |
| 197 Vchain = "" | 205 Vchain = "" |
| 198 Dchain = "" | 206 Dchain = "" |
| 199 Jchain = "" | 207 Jchain = "" |
| 200 | 208 |
| 231 useD = TRUE | 239 useD = TRUE |
| 232 if(nrow(Dchain) == 0){ | 240 if(nrow(Dchain) == 0){ |
| 233 useD = FALSE | 241 useD = FALSE |
| 234 cat("No D Genes in this species/locus") | 242 cat("No D Genes in this species/locus") |
| 235 } | 243 } |
| 236 print(paste("useD:", useD)) | 244 print(paste(nrow(Vchain), "genes in V")) |
| 245 print(paste(nrow(Dchain), "genes in D")) | |
| 246 print(paste(nrow(Jchain), "genes in J")) | |
| 237 | 247 |
| 238 # ---------------------- merge with the frequency count ---------------------- | 248 # ---------------------- merge with the frequency count ---------------------- |
| 239 | 249 |
| 240 PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE) | 250 PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE) |
| 241 | 251 |
| 242 PRODFD = merge(PRODFD, Dchain, by.x='Top.D.Gene', by.y='v.name', all.x=TRUE) | 252 PRODFD = merge(PRODFD, Dchain, by.x='Top.D.Gene', by.y='v.name', all.x=TRUE) |
| 243 | 253 |
| 244 PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE) | 254 PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE) |
| 245 | 255 |
| 246 # ---------------------- Create the V, D and J frequency plots and write the data.frame for every plot to a file ---------------------- | 256 # ---------------------- Create the V, D and J frequency plots and write the data.frame for every plot to a file ---------------------- |
| 257 | |
| 258 print("Report Clonality - V, D and J frequency plots") | |
| 247 | 259 |
| 248 pV = ggplot(PRODFV) | 260 pV = ggplot(PRODFV) |
| 249 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 261 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) |
| 250 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") | 262 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") |
| 251 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 263 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 282 png("JPlot.png",width = 800, height = 600) | 294 png("JPlot.png",width = 800, height = 600) |
| 283 pJ | 295 pJ |
| 284 dev.off(); | 296 dev.off(); |
| 285 | 297 |
| 286 # ---------------------- Now the frequency plots of the V, D and J families ---------------------- | 298 # ---------------------- Now the frequency plots of the V, D and J families ---------------------- |
| 299 | |
| 300 print("Report Clonality - V, D and J family plots") | |
| 287 | 301 |
| 288 VGenes = PRODF[,c("Sample", "Top.V.Gene")] | 302 VGenes = PRODF[,c("Sample", "Top.V.Gene")] |
| 289 VGenes$Top.V.Gene = gsub("-.*", "", VGenes$Top.V.Gene) | 303 VGenes$Top.V.Gene = gsub("-.*", "", VGenes$Top.V.Gene) |
| 290 VGenes = data.frame(data.table(VGenes)[, list(Count=.N), by=c("Sample", "Top.V.Gene")]) | 304 VGenes = data.frame(data.table(VGenes)[, list(Count=.N), by=c("Sample", "Top.V.Gene")]) |
| 291 TotalPerSample = data.frame(data.table(VGenes)[, list(total=sum(.SD$Count)), by=Sample]) | 305 TotalPerSample = data.frame(data.table(VGenes)[, list(total=sum(.SD$Count)), by=Sample]) |
| 332 dev.off(); | 346 dev.off(); |
| 333 write.table(x=JGenes, file="JFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 347 write.table(x=JGenes, file="JFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 334 | 348 |
| 335 # ---------------------- Plotting the cdr3 length ---------------------- | 349 # ---------------------- Plotting the cdr3 length ---------------------- |
| 336 | 350 |
| 351 print("Report Clonality - CDR3 length plot") | |
| 352 | |
| 337 CDR3Length = data.frame(data.table(PRODF)[, list(Count=.N), by=c("Sample", "CDR3.Length.DNA")]) | 353 CDR3Length = data.frame(data.table(PRODF)[, list(Count=.N), by=c("Sample", "CDR3.Length.DNA")]) |
| 338 TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample]) | 354 TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample]) |
| 339 CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample") | 355 CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample") |
| 340 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total | 356 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total |
| 341 CDR3LengthPlot = ggplot(CDR3Length) | 357 CDR3LengthPlot = ggplot(CDR3Length) |
| 348 dev.off() | 364 dev.off() |
| 349 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) | 365 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 350 | 366 |
| 351 # ---------------------- Plot the heatmaps ---------------------- | 367 # ---------------------- Plot the heatmaps ---------------------- |
| 352 | 368 |
| 353 | |
| 354 #get the reverse order for the V and D genes | 369 #get the reverse order for the V and D genes |
| 355 revVchain = Vchain | 370 revVchain = Vchain |
| 356 revDchain = Dchain | 371 revDchain = Dchain |
| 357 revVchain$chr.orderV = rev(revVchain$chr.orderV) | 372 revVchain$chr.orderV = rev(revVchain$chr.orderV) |
| 358 revDchain$chr.orderD = rev(revDchain$chr.orderD) | 373 revDchain$chr.orderD = rev(revDchain$chr.orderD) |
| 359 | 374 |
| 360 if(useD){ | 375 if(useD){ |
| 376 print("Report Clonality - Heatmaps VD") | |
| 361 plotVD <- function(dat){ | 377 plotVD <- function(dat){ |
| 362 if(length(dat[,1]) == 0){ | 378 if(length(dat[,1]) == 0){ |
| 363 return() | 379 return() |
| 364 } | 380 } |
| 365 img = ggplot() + | 381 img = ggplot() + |
| 387 | 403 |
| 388 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) | 404 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) |
| 389 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) | 405 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) |
| 390 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) | 406 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) |
| 391 VDList = split(completeVD, f=completeVD[,"Sample"]) | 407 VDList = split(completeVD, f=completeVD[,"Sample"]) |
| 392 | |
| 393 lapply(VDList, FUN=plotVD) | 408 lapply(VDList, FUN=plotVD) |
| 394 } | 409 } |
| 410 | |
| 411 print("Report Clonality - Heatmaps VJ") | |
| 395 | 412 |
| 396 plotVJ <- function(dat){ | 413 plotVJ <- function(dat){ |
| 397 if(length(dat[,1]) == 0){ | 414 if(length(dat[,1]) == 0){ |
| 398 return() | 415 return() |
| 399 } | 416 } |
| 425 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) | 442 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) |
| 426 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) | 443 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) |
| 427 VJList = split(completeVJ, f=completeVJ[,"Sample"]) | 444 VJList = split(completeVJ, f=completeVJ[,"Sample"]) |
| 428 lapply(VJList, FUN=plotVJ) | 445 lapply(VJList, FUN=plotVJ) |
| 429 | 446 |
| 447 | |
| 448 | |
| 430 if(useD){ | 449 if(useD){ |
| 450 print("Report Clonality - Heatmaps DJ") | |
| 431 plotDJ <- function(dat){ | 451 plotDJ <- function(dat){ |
| 432 if(length(dat[,1]) == 0){ | 452 if(length(dat[,1]) == 0){ |
| 433 return() | 453 return() |
| 434 } | 454 } |
| 435 img = ggplot() + | 455 img = ggplot() + |
