comparison mutation_analysis.r @ 118:ad7ca9c2b748 draft

Uploaded
author davidvanzessen
date Thu, 11 Aug 2016 08:00:00 -0400
parents ede6c4ee5196
children 626a956f3811
comparison
equal deleted inserted replaced
117:a8f91c52411c 118:ad7ca9c2b748
103 "FR3.IMGT.Nb.of.silent.mutations", 103 "FR3.IMGT.Nb.of.silent.mutations",
104 "FR1.IMGT.Nb.of.nonsilent.mutations", 104 "FR1.IMGT.Nb.of.nonsilent.mutations",
105 "FR2.IMGT.Nb.of.nonsilent.mutations", 105 "FR2.IMGT.Nb.of.nonsilent.mutations",
106 "FR3.IMGT.Nb.of.nonsilent.mutations") 106 "FR3.IMGT.Nb.of.nonsilent.mutations")
107 107
108
109 print("Cleaning up columns")
108 for(col in cleanup_columns){ 110 for(col in cleanup_columns){
109 dat[,col] = gsub("\\(.*\\)", "", dat[,col]) 111 dat[,col] = gsub("\\(.*\\)", "", dat[,col])
110 #dat[dat[,col] == "",] = "0" 112 #dat[dat[,col] == "",] = "0"
111 dat[,col] = as.numeric(dat[,col]) 113 dat[,col] = as.numeric(dat[,col])
112 dat[is.na(dat[,col]),col] = 0 114 dat[is.na(dat[,col]),col] = 0
116 if(!include_fr1){ 118 if(!include_fr1){
117 regions = c("CDR1", "FR2", "CDR2", "FR3") 119 regions = c("CDR1", "FR2", "CDR2", "FR3")
118 } 120 }
119 121
120 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } 122 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }
123
124 print("aggregating data into new columns")
121 125
122 VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="") 126 VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="")
123 dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns) 127 dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns)
124 128
125 VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="") 129 VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="")
302 zeros=rep(0, 4) 306 zeros=rep(0, 4)
303 307
304 funcs = c(median, sum, mean) 308 funcs = c(median, sum, mean)
305 fnames = c("median", "sum", "mean") 309 fnames = c("median", "sum", "mean")
306 310
311 print("Creating result tables")
312
307 for(i in 1:length(funcs)){ 313 for(i in 1:length(funcs)){
308 func = funcs[[i]] 314 func = funcs[[i]]
309 fname = fnames[[i]] 315 fname = fnames[[i]]
310 316
311 rows = 9 317 rows = 9
312 if(fname == "sum"){ 318 if(fname == "sum"){
313 rows = 11 319 rows = 11
314 } 320 }
315 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows) 321 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows)
316 322
317 for(i in 1:length(genes)){ 323 for(i in 1:length(genes)){
318 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) 324 print(paste("Creating table for", fname, genes[i]))
325 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i])
319 } 326 }
320 327
321 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all") 328 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all")
322 329
323 result = data.frame(matrx) 330 result = data.frame(matrx)
328 } 335 }
329 336
330 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) 337 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F)
331 } 338 }
332 339
340 print("Adding median number of mutations to sum table")
341
333 sum.table = read.table("mutations_sum.txt", sep=",", header=F) 342 sum.table = read.table("mutations_sum.txt", sep=",", header=F)
334 median.table = read.table("mutations_median.txt", sep=",", header=F) 343 median.table = read.table("mutations_median.txt", sep=",", header=F)
335
336 #sum.table["Median of Number of Mutations (%)",] = median.table[1,]
337 344
338 new.table = sum.table[1,] 345 new.table = sum.table[1,]
339 new.table[2,] = median.table[1,] 346 new.table[2,] = median.table[1,]
340 new.table[3:12,] = sum.table[2:11,] 347 new.table[3:12,] = sum.table[2:11,]
341 new.table[,1] = as.character(new.table[,1]) 348 new.table[,1] = as.character(new.table[,1])
342 new.table[2,1] = "Median of Number of Mutations (%)" 349 new.table[2,1] = "Median of Number of Mutations (%)"
343 350
344 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] 351 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),]
345 352
346 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F) 353 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F)
354
355
356 print("Plotting ca piechart")
347 357
348 dat = dat[!grepl("^unmatched", dat$best_match),] 358 dat = dat[!grepl("^unmatched", dat$best_match),]
349 359
350 #blegh 360 #blegh
351 genesForPlot = dat[grepl("ca", dat$best_match),]$best_match 361 genesForPlot = dat[grepl("ca", dat$best_match),]$best_match
363 png(filename="ca.png") 373 png(filename="ca.png")
364 print(pc) 374 print(pc)
365 dev.off() 375 dev.off()
366 } 376 }
367 377
378 print("Plotting cg piechart")
379
368 genesForPlot = dat[grepl("cg", dat$best_match),]$best_match 380 genesForPlot = dat[grepl("cg", dat$best_match),]$best_match
369 if(length(genesForPlot) > 0){ 381 if(length(genesForPlot) > 0){
370 genesForPlot = data.frame(table(genesForPlot)) 382 genesForPlot = data.frame(table(genesForPlot))
371 colnames(genesForPlot) = c("Gene","Freq") 383 colnames(genesForPlot) = c("Gene","Freq")
372 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) 384 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
380 png(filename="cg.png") 392 png(filename="cg.png")
381 print(pc) 393 print(pc)
382 dev.off() 394 dev.off()
383 } 395 }
384 396
397
398 print("Plotting scatterplot")
399
385 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) 400 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)
386 401
387 p = ggplot(dat, aes(best_match, percentage_mutations)) 402 p = ggplot(dat, aes(best_match, percentage_mutations))
388 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) 403 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)
389 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") 404 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")
394 409
395 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) 410 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
396 411
397 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) 412 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T)
398 413
414
415 print("Plotting frequency ranges plot")
416
399 dat$best_match_class = substr(dat$best_match, 0, 2) 417 dat$best_match_class = substr(dat$best_match, 0, 2)
400 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") 418 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
401 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) 419 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)
402 420
403 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")]) 421 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])