Mercurial > repos > davidvanzessen > mutation_analysis
comparison mutation_analysis.r @ 49:5c6b9e99d576 draft
Uploaded
| author | davidvanzessen | 
|---|---|
| date | Wed, 18 Nov 2015 05:55:04 -0500 | 
| parents | 099cc1254f74 | 
| children | 7290a88ea202 | 
   comparison
  equal
  deleted
  inserted
  replaced
| 48:d09b1bdfd388 | 49:5c6b9e99d576 | 
|---|---|
| 1 library(data.table) | |
| 2 library(ggplot2) | |
| 3 | |
| 1 args <- commandArgs(trailingOnly = TRUE) | 4 args <- commandArgs(trailingOnly = TRUE) | 
| 2 | 5 | 
| 3 input = args[1] | 6 input = args[1] | 
| 4 genes = unlist(strsplit(args[2], ",")) | 7 genes = unlist(strsplit(args[2], ",")) | 
| 5 outputdir = args[3] | 8 outputdir = args[3] | 
| 131 | 134 | 
| 132 | 135 | 
| 133 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") | 136 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") | 
| 134 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) | 137 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) | 
| 135 | 138 | 
| 136 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t"), sep="") | 139 | 
| 140 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="") | |
| 141 #totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="") | |
| 137 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) | 142 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) | 
| 143 | |
| 144 transitionMutationsAtAT_columns = paste(rep(regions, each=2), c(".IMGT.a.g",".IMGT.t.c"), sep="") | |
| 145 dat$transitionMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtAT_columns) | |
| 146 | |
| 147 totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="") | |
| 148 #totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="") | |
| 149 dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns) | |
| 150 | |
| 138 | 151 | 
| 139 FRRegions = regions[grepl("FR", regions)] | 152 FRRegions = regions[grepl("FR", regions)] | 
| 140 CDRRegions = regions[grepl("CDR", regions)] | 153 CDRRegions = regions[grepl("CDR", regions)] | 
| 141 | 154 | 
| 142 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="") | 155 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="") | 
| 149 dat$nonSilentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_nonSilentMutations_columns) | 162 dat$nonSilentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_nonSilentMutations_columns) | 
| 150 | 163 | 
| 151 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") | 164 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") | 
| 152 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) | 165 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) | 
| 153 | 166 | 
| 154 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") | 167 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") | 
| 155 | 168 | 
| 156 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) | 169 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 157 | 170 | 
| 158 setwd(outputdir) | 171 setwd(outputdir) | 
| 159 | 172 | 
| 160 nts = c("a", "c", "g", "t") | 173 nts = c("a", "c", "g", "t") | 
| 161 zeros=rep(0, 4) | 174 zeros=rep(0, 4) | 
| 162 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=7) | 175 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=9) | 
| 163 for(i in 1:length(genes)){ | 176 for(i in 1:length(genes)){ | 
| 164 gene = genes[i] | 177 gene = genes[i] | 
| 165 tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] | 178 tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] | 
| 166 if(gene == "."){ | 179 if(gene == "."){ | 
| 167 tmp = dat | 180 tmp = dat | 
| 171 y = (j * 3) + 2 | 184 y = (j * 3) + 2 | 
| 172 z = (j * 3) + 3 | 185 z = (j * 3) + 3 | 
| 173 matrx[1,x] = sum(tmp$VRegionMutations) | 186 matrx[1,x] = sum(tmp$VRegionMutations) | 
| 174 matrx[1,y] = sum(tmp$VRegionNucleotides) | 187 matrx[1,y] = sum(tmp$VRegionNucleotides) | 
| 175 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) | 188 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) | 
| 189 | |
| 176 matrx[2,x] = sum(tmp$transitionMutations) | 190 matrx[2,x] = sum(tmp$transitionMutations) | 
| 177 matrx[2,y] = sum(tmp$VRegionMutations) | 191 matrx[2,y] = sum(tmp$VRegionMutations) | 
| 178 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) | 192 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) | 
| 193 | |
| 179 matrx[3,x] = sum(tmp$transversionMutations) | 194 matrx[3,x] = sum(tmp$transversionMutations) | 
| 180 matrx[3,y] = sum(tmp$VRegionMutations) | 195 matrx[3,y] = sum(tmp$VRegionMutations) | 
| 181 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) | 196 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) | 
| 197 | |
| 182 matrx[4,x] = sum(tmp$transitionMutationsAtGC) | 198 matrx[4,x] = sum(tmp$transitionMutationsAtGC) | 
| 183 matrx[4,y] = sum(tmp$totalMutationsAtGC) | 199 matrx[4,y] = sum(tmp$totalMutationsAtGC) | 
| 184 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) | 200 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) | 
| 201 | |
| 185 matrx[5,x] = sum(tmp$totalMutationsAtGC) | 202 matrx[5,x] = sum(tmp$totalMutationsAtGC) | 
| 186 matrx[5,y] = sum(tmp$VRegionMutations) | 203 matrx[5,y] = sum(tmp$VRegionMutations) | 
| 187 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) | 204 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) | 
| 188 matrx[6,x] = sum(tmp$nonSilentMutationsFR) | 205 | 
| 189 matrx[6,y] = sum(tmp$silentMutationsFR) | 206 matrx[6,x] = sum(tmp$transitionMutationsAtAT) | 
| 190 matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1) | 207 matrx[6,y] = sum(tmp$totalMutationsAtAT) | 
| 191 matrx[7,x] = sum(tmp$nonSilentMutationsCDR) | 208 matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) | 
| 192 matrx[7,y] = sum(tmp$silentMutationsCDR) | 209 | 
| 193 matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1) | 210 matrx[7,x] = sum(tmp$totalMutationsAtAT) | 
| 211 matrx[7,y] = sum(tmp$VRegionMutations) | |
| 212 matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) | |
| 213 | |
| 214 matrx[8,x] = sum(tmp$nonSilentMutationsFR) | |
| 215 matrx[8,y] = sum(tmp$silentMutationsFR) | |
| 216 matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) | |
| 217 | |
| 218 matrx[9,x] = sum(tmp$nonSilentMutationsCDR) | |
| 219 matrx[9,y] = sum(tmp$silentMutationsCDR) | |
| 220 matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) | |
| 194 | 221 | 
| 195 | 222 | 
| 196 transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) | 223 transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) | 
| 197 row.names(transitionTable) = c("A", "C", "G", "T") | 224 row.names(transitionTable) = c("A", "C", "G", "T") | 
| 198 transitionTable["A","A"] = NA | 225 transitionTable["A","A"] = NA | 
| 237 y = (j * 3) + 2 | 264 y = (j * 3) + 2 | 
| 238 z = (j * 3) + 3 | 265 z = (j * 3) + 3 | 
| 239 matrx[1,x] = sum(tmp$VRegionMutations) | 266 matrx[1,x] = sum(tmp$VRegionMutations) | 
| 240 matrx[1,y] = sum(tmp$VRegionNucleotides) | 267 matrx[1,y] = sum(tmp$VRegionNucleotides) | 
| 241 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) | 268 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) | 
| 269 | |
| 242 matrx[2,x] = sum(tmp$transitionMutations) | 270 matrx[2,x] = sum(tmp$transitionMutations) | 
| 243 matrx[2,y] = sum(tmp$VRegionMutations) | 271 matrx[2,y] = sum(tmp$VRegionMutations) | 
| 244 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) | 272 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) | 
| 273 | |
| 245 matrx[3,x] = sum(tmp$transversionMutations) | 274 matrx[3,x] = sum(tmp$transversionMutations) | 
| 246 matrx[3,y] = sum(tmp$VRegionMutations) | 275 matrx[3,y] = sum(tmp$VRegionMutations) | 
| 247 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) | 276 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) | 
| 277 | |
| 248 matrx[4,x] = sum(tmp$transitionMutationsAtGC) | 278 matrx[4,x] = sum(tmp$transitionMutationsAtGC) | 
| 249 matrx[4,y] = sum(tmp$totalMutationsAtGC) | 279 matrx[4,y] = sum(tmp$totalMutationsAtGC) | 
| 250 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) | 280 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) | 
| 281 | |
| 251 matrx[5,x] = sum(tmp$totalMutationsAtGC) | 282 matrx[5,x] = sum(tmp$totalMutationsAtGC) | 
| 252 matrx[5,y] = sum(tmp$VRegionMutations) | 283 matrx[5,y] = sum(tmp$VRegionMutations) | 
| 253 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) | 284 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) | 
| 254 matrx[6,x] = sum(tmp$nonSilentMutationsFR) | 285 | 
| 255 matrx[6,y] = sum(tmp$silentMutationsFR) | 286 matrx[6,x] = sum(tmp$transitionMutationsAtAT) | 
| 256 matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1) | 287 matrx[6,y] = sum(tmp$totalMutationsAtAT) | 
| 257 matrx[7,x] = sum(tmp$nonSilentMutationsCDR) | 288 matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) | 
| 258 matrx[7,y] = sum(tmp$silentMutationsCDR) | 289 | 
| 259 matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1) | 290 matrx[7,x] = sum(tmp$totalMutationsAtAT) | 
| 291 matrx[7,y] = sum(tmp$VRegionMutations) | |
| 292 matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) | |
| 293 | |
| 294 matrx[8,x] = sum(tmp$nonSilentMutationsFR) | |
| 295 matrx[8,y] = sum(tmp$silentMutationsFR) | |
| 296 matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) | |
| 297 | |
| 298 matrx[9,x] = sum(tmp$nonSilentMutationsCDR) | |
| 299 matrx[9,y] = sum(tmp$silentMutationsCDR) | |
| 300 matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) | |
| 260 | 301 | 
| 261 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) | 302 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) | 
| 262 row.names(transitionTable) = c("A", "C", "G", "T") | 303 row.names(transitionTable) = c("A", "C", "G", "T") | 
| 263 transitionTable["A","A"] = NA | 304 transitionTable["A","A"] = NA | 
| 264 transitionTable["C","C"] = NA | 305 transitionTable["C","C"] = NA | 
| 291 cat(length(tmp$Sequence.ID), file="total_n.txt") | 332 cat(length(tmp$Sequence.ID), file="total_n.txt") | 
| 292 | 333 | 
| 293 | 334 | 
| 294 | 335 | 
| 295 result = data.frame(matrx) | 336 result = data.frame(matrx) | 
| 296 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)", "FR R/S (ratio)", "CDR R/S (ratio)") | 337 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") | 
| 297 | 338 | 
| 298 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) | 339 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) | 
| 299 | 340 | 
| 300 | 341 | 
| 301 if (!("ggplot2" %in% rownames(installed.packages()))) { | 342 if (!("ggplot2" %in% rownames(installed.packages()))) { | 
| 302 install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") | 343 install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") | 
| 303 } | 344 } | 
| 304 library(ggplot2) | 345 | 
| 305 | 346 | 
| 306 genesForPlot = gsub("[0-9]", "", dat$best_match) | 347 genesForPlot = gsub("[0-9]", "", dat$best_match) | 
| 307 genesForPlot = data.frame(table(genesForPlot)) | 348 genesForPlot = data.frame(table(genesForPlot)) | 
| 308 colnames(genesForPlot) = c("Gene","Freq") | 349 colnames(genesForPlot) = c("Gene","Freq") | 
| 309 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) | 350 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) | 
| 358 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) | 399 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) | 
| 359 | 400 | 
| 360 p = ggplot(dat, aes(best_match, percentage_mutations)) | 401 p = ggplot(dat, aes(best_match, percentage_mutations)) | 
| 361 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) | 402 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) | 
| 362 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") | 403 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") | 
| 363 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) | |
| 364 | |
| 365 | 404 | 
| 366 png(filename="scatter.png") | 405 png(filename="scatter.png") | 
| 367 print(p) | 406 print(p) | 
| 368 dev.off() | 407 dev.off() | 
| 369 | 408 | 
| 370 | 409 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 371 | 410 | 
| 372 | 411 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) | 
| 373 | 412 | 
| 374 | 413 | 
| 375 | 414 | 
| 376 | 415 | 
| 377 | 416 | 
| 378 | 417 | 
| 379 | 418 dat$best_match_class = substr(dat$best_match, 0, 2) | 
| 380 | 419 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") | 
| 381 | 420 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) | 
| 382 | 421 | 
| 383 | 422 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")]) | 
| 384 | 423 | 
| 385 | 424 p = ggplot(frequency_bins_data, aes(frequency_bins, frequency_count)) | 
| 386 | 425 p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") | 
| 387 | 426 p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") | 
| 388 | 427 | 
| 389 | 428 png(filename="frequency_ranges.png") | 
| 390 | 429 print(p) | 
| 391 | 430 dev.off() | 
| 392 | 431 | 
| 393 | 432 frequency_bins_data_by_class = frequency_bins_data | 
| 394 | 433 | 
| 395 | 434 write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 396 | 435 | 
| 397 | 436 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "frequency_bins")]) | 
| 398 | 437 | 
| 399 | 438 write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 400 | 439 | 
| 440 | |
| 441 #frequency_bins_data_by_class | |
| 442 #frequency_ranges_subclasses.txt | |
| 443 | |
| 444 | |
| 445 | |
| 446 | |
| 447 | |
| 448 | |
| 449 | |
| 450 | |
| 451 | |
| 452 | |
| 453 | |
| 454 | |
| 455 | |
| 456 | |
| 457 | |
| 458 | |
| 459 | |
| 460 | |
| 461 | |
| 462 | |
| 463 | |
| 464 | |
| 465 | |
| 466 | |
| 467 | |
| 468 | |
| 469 | 
