Mercurial > repos > davidvanzessen > mutation_analysis
comparison mutation_analysis.r @ 118:ad7ca9c2b748 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 11 Aug 2016 08:00:00 -0400 |
parents | ede6c4ee5196 |
children | 626a956f3811 |
comparison
equal
deleted
inserted
replaced
117:a8f91c52411c | 118:ad7ca9c2b748 |
---|---|
103 "FR3.IMGT.Nb.of.silent.mutations", | 103 "FR3.IMGT.Nb.of.silent.mutations", |
104 "FR1.IMGT.Nb.of.nonsilent.mutations", | 104 "FR1.IMGT.Nb.of.nonsilent.mutations", |
105 "FR2.IMGT.Nb.of.nonsilent.mutations", | 105 "FR2.IMGT.Nb.of.nonsilent.mutations", |
106 "FR3.IMGT.Nb.of.nonsilent.mutations") | 106 "FR3.IMGT.Nb.of.nonsilent.mutations") |
107 | 107 |
108 | |
109 print("Cleaning up columns") | |
108 for(col in cleanup_columns){ | 110 for(col in cleanup_columns){ |
109 dat[,col] = gsub("\\(.*\\)", "", dat[,col]) | 111 dat[,col] = gsub("\\(.*\\)", "", dat[,col]) |
110 #dat[dat[,col] == "",] = "0" | 112 #dat[dat[,col] == "",] = "0" |
111 dat[,col] = as.numeric(dat[,col]) | 113 dat[,col] = as.numeric(dat[,col]) |
112 dat[is.na(dat[,col]),col] = 0 | 114 dat[is.na(dat[,col]),col] = 0 |
116 if(!include_fr1){ | 118 if(!include_fr1){ |
117 regions = c("CDR1", "FR2", "CDR2", "FR3") | 119 regions = c("CDR1", "FR2", "CDR2", "FR3") |
118 } | 120 } |
119 | 121 |
120 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } | 122 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } |
123 | |
124 print("aggregating data into new columns") | |
121 | 125 |
122 VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="") | 126 VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="") |
123 dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns) | 127 dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns) |
124 | 128 |
125 VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="") | 129 VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="") |
302 zeros=rep(0, 4) | 306 zeros=rep(0, 4) |
303 | 307 |
304 funcs = c(median, sum, mean) | 308 funcs = c(median, sum, mean) |
305 fnames = c("median", "sum", "mean") | 309 fnames = c("median", "sum", "mean") |
306 | 310 |
311 print("Creating result tables") | |
312 | |
307 for(i in 1:length(funcs)){ | 313 for(i in 1:length(funcs)){ |
308 func = funcs[[i]] | 314 func = funcs[[i]] |
309 fname = fnames[[i]] | 315 fname = fnames[[i]] |
310 | 316 |
311 rows = 9 | 317 rows = 9 |
312 if(fname == "sum"){ | 318 if(fname == "sum"){ |
313 rows = 11 | 319 rows = 11 |
314 } | 320 } |
315 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows) | 321 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows) |
316 | 322 |
317 for(i in 1:length(genes)){ | 323 for(i in 1:length(genes)){ |
318 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) | 324 print(paste("Creating table for", fname, genes[i])) |
325 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) | |
319 } | 326 } |
320 | 327 |
321 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all") | 328 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all") |
322 | 329 |
323 result = data.frame(matrx) | 330 result = data.frame(matrx) |
328 } | 335 } |
329 | 336 |
330 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) | 337 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) |
331 } | 338 } |
332 | 339 |
340 print("Adding median number of mutations to sum table") | |
341 | |
333 sum.table = read.table("mutations_sum.txt", sep=",", header=F) | 342 sum.table = read.table("mutations_sum.txt", sep=",", header=F) |
334 median.table = read.table("mutations_median.txt", sep=",", header=F) | 343 median.table = read.table("mutations_median.txt", sep=",", header=F) |
335 | |
336 #sum.table["Median of Number of Mutations (%)",] = median.table[1,] | |
337 | 344 |
338 new.table = sum.table[1,] | 345 new.table = sum.table[1,] |
339 new.table[2,] = median.table[1,] | 346 new.table[2,] = median.table[1,] |
340 new.table[3:12,] = sum.table[2:11,] | 347 new.table[3:12,] = sum.table[2:11,] |
341 new.table[,1] = as.character(new.table[,1]) | 348 new.table[,1] = as.character(new.table[,1]) |
342 new.table[2,1] = "Median of Number of Mutations (%)" | 349 new.table[2,1] = "Median of Number of Mutations (%)" |
343 | 350 |
344 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] | 351 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] |
345 | 352 |
346 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F) | 353 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F) |
354 | |
355 | |
356 print("Plotting ca piechart") | |
347 | 357 |
348 dat = dat[!grepl("^unmatched", dat$best_match),] | 358 dat = dat[!grepl("^unmatched", dat$best_match),] |
349 | 359 |
350 #blegh | 360 #blegh |
351 genesForPlot = dat[grepl("ca", dat$best_match),]$best_match | 361 genesForPlot = dat[grepl("ca", dat$best_match),]$best_match |
363 png(filename="ca.png") | 373 png(filename="ca.png") |
364 print(pc) | 374 print(pc) |
365 dev.off() | 375 dev.off() |
366 } | 376 } |
367 | 377 |
378 print("Plotting cg piechart") | |
379 | |
368 genesForPlot = dat[grepl("cg", dat$best_match),]$best_match | 380 genesForPlot = dat[grepl("cg", dat$best_match),]$best_match |
369 if(length(genesForPlot) > 0){ | 381 if(length(genesForPlot) > 0){ |
370 genesForPlot = data.frame(table(genesForPlot)) | 382 genesForPlot = data.frame(table(genesForPlot)) |
371 colnames(genesForPlot) = c("Gene","Freq") | 383 colnames(genesForPlot) = c("Gene","Freq") |
372 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) | 384 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) |
380 png(filename="cg.png") | 392 png(filename="cg.png") |
381 print(pc) | 393 print(pc) |
382 dev.off() | 394 dev.off() |
383 } | 395 } |
384 | 396 |
397 | |
398 print("Plotting scatterplot") | |
399 | |
385 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) | 400 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) |
386 | 401 |
387 p = ggplot(dat, aes(best_match, percentage_mutations)) | 402 p = ggplot(dat, aes(best_match, percentage_mutations)) |
388 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) | 403 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) |
389 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") | 404 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") |
394 | 409 |
395 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) | 410 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) |
396 | 411 |
397 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) | 412 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) |
398 | 413 |
414 | |
415 print("Plotting frequency ranges plot") | |
416 | |
399 dat$best_match_class = substr(dat$best_match, 0, 2) | 417 dat$best_match_class = substr(dat$best_match, 0, 2) |
400 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") | 418 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") |
401 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) | 419 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) |
402 | 420 |
403 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")]) | 421 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")]) |