comparison mutation_analysis.r @ 49:5c6b9e99d576 draft

Uploaded
author davidvanzessen
date Wed, 18 Nov 2015 05:55:04 -0500
parents 099cc1254f74
children 7290a88ea202
comparison
equal deleted inserted replaced
48:d09b1bdfd388 49:5c6b9e99d576
1 library(data.table)
2 library(ggplot2)
3
1 args <- commandArgs(trailingOnly = TRUE) 4 args <- commandArgs(trailingOnly = TRUE)
2 5
3 input = args[1] 6 input = args[1]
4 genes = unlist(strsplit(args[2], ",")) 7 genes = unlist(strsplit(args[2], ","))
5 outputdir = args[3] 8 outputdir = args[3]
131 134
132 135
133 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") 136 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="")
134 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) 137 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns)
135 138
136 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t"), sep="") 139
140 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="")
141 #totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="")
137 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) 142 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns)
143
144 transitionMutationsAtAT_columns = paste(rep(regions, each=2), c(".IMGT.a.g",".IMGT.t.c"), sep="")
145 dat$transitionMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtAT_columns)
146
147 totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="")
148 #totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="")
149 dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns)
150
138 151
139 FRRegions = regions[grepl("FR", regions)] 152 FRRegions = regions[grepl("FR", regions)]
140 CDRRegions = regions[grepl("CDR", regions)] 153 CDRRegions = regions[grepl("CDR", regions)]
141 154
142 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="") 155 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="")
149 dat$nonSilentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_nonSilentMutations_columns) 162 dat$nonSilentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_nonSilentMutations_columns)
150 163
151 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") 164 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="")
152 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) 165 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns)
153 166
154 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") 167 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR")
155 168
156 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) 169 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T)
157 170
158 setwd(outputdir) 171 setwd(outputdir)
159 172
160 nts = c("a", "c", "g", "t") 173 nts = c("a", "c", "g", "t")
161 zeros=rep(0, 4) 174 zeros=rep(0, 4)
162 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=7) 175 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=9)
163 for(i in 1:length(genes)){ 176 for(i in 1:length(genes)){
164 gene = genes[i] 177 gene = genes[i]
165 tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] 178 tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),]
166 if(gene == "."){ 179 if(gene == "."){
167 tmp = dat 180 tmp = dat
171 y = (j * 3) + 2 184 y = (j * 3) + 2
172 z = (j * 3) + 3 185 z = (j * 3) + 3
173 matrx[1,x] = sum(tmp$VRegionMutations) 186 matrx[1,x] = sum(tmp$VRegionMutations)
174 matrx[1,y] = sum(tmp$VRegionNucleotides) 187 matrx[1,y] = sum(tmp$VRegionNucleotides)
175 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) 188 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
189
176 matrx[2,x] = sum(tmp$transitionMutations) 190 matrx[2,x] = sum(tmp$transitionMutations)
177 matrx[2,y] = sum(tmp$VRegionMutations) 191 matrx[2,y] = sum(tmp$VRegionMutations)
178 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) 192 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
193
179 matrx[3,x] = sum(tmp$transversionMutations) 194 matrx[3,x] = sum(tmp$transversionMutations)
180 matrx[3,y] = sum(tmp$VRegionMutations) 195 matrx[3,y] = sum(tmp$VRegionMutations)
181 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) 196 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
197
182 matrx[4,x] = sum(tmp$transitionMutationsAtGC) 198 matrx[4,x] = sum(tmp$transitionMutationsAtGC)
183 matrx[4,y] = sum(tmp$totalMutationsAtGC) 199 matrx[4,y] = sum(tmp$totalMutationsAtGC)
184 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) 200 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
201
185 matrx[5,x] = sum(tmp$totalMutationsAtGC) 202 matrx[5,x] = sum(tmp$totalMutationsAtGC)
186 matrx[5,y] = sum(tmp$VRegionMutations) 203 matrx[5,y] = sum(tmp$VRegionMutations)
187 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) 204 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
188 matrx[6,x] = sum(tmp$nonSilentMutationsFR) 205
189 matrx[6,y] = sum(tmp$silentMutationsFR) 206 matrx[6,x] = sum(tmp$transitionMutationsAtAT)
190 matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1) 207 matrx[6,y] = sum(tmp$totalMutationsAtAT)
191 matrx[7,x] = sum(tmp$nonSilentMutationsCDR) 208 matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1)
192 matrx[7,y] = sum(tmp$silentMutationsCDR) 209
193 matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1) 210 matrx[7,x] = sum(tmp$totalMutationsAtAT)
211 matrx[7,y] = sum(tmp$VRegionMutations)
212 matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1)
213
214 matrx[8,x] = sum(tmp$nonSilentMutationsFR)
215 matrx[8,y] = sum(tmp$silentMutationsFR)
216 matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1)
217
218 matrx[9,x] = sum(tmp$nonSilentMutationsCDR)
219 matrx[9,y] = sum(tmp$silentMutationsCDR)
220 matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1)
194 221
195 222
196 transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) 223 transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros)
197 row.names(transitionTable) = c("A", "C", "G", "T") 224 row.names(transitionTable) = c("A", "C", "G", "T")
198 transitionTable["A","A"] = NA 225 transitionTable["A","A"] = NA
237 y = (j * 3) + 2 264 y = (j * 3) + 2
238 z = (j * 3) + 3 265 z = (j * 3) + 3
239 matrx[1,x] = sum(tmp$VRegionMutations) 266 matrx[1,x] = sum(tmp$VRegionMutations)
240 matrx[1,y] = sum(tmp$VRegionNucleotides) 267 matrx[1,y] = sum(tmp$VRegionNucleotides)
241 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) 268 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
269
242 matrx[2,x] = sum(tmp$transitionMutations) 270 matrx[2,x] = sum(tmp$transitionMutations)
243 matrx[2,y] = sum(tmp$VRegionMutations) 271 matrx[2,y] = sum(tmp$VRegionMutations)
244 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) 272 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
273
245 matrx[3,x] = sum(tmp$transversionMutations) 274 matrx[3,x] = sum(tmp$transversionMutations)
246 matrx[3,y] = sum(tmp$VRegionMutations) 275 matrx[3,y] = sum(tmp$VRegionMutations)
247 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) 276 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
277
248 matrx[4,x] = sum(tmp$transitionMutationsAtGC) 278 matrx[4,x] = sum(tmp$transitionMutationsAtGC)
249 matrx[4,y] = sum(tmp$totalMutationsAtGC) 279 matrx[4,y] = sum(tmp$totalMutationsAtGC)
250 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) 280 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
281
251 matrx[5,x] = sum(tmp$totalMutationsAtGC) 282 matrx[5,x] = sum(tmp$totalMutationsAtGC)
252 matrx[5,y] = sum(tmp$VRegionMutations) 283 matrx[5,y] = sum(tmp$VRegionMutations)
253 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) 284 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
254 matrx[6,x] = sum(tmp$nonSilentMutationsFR) 285
255 matrx[6,y] = sum(tmp$silentMutationsFR) 286 matrx[6,x] = sum(tmp$transitionMutationsAtAT)
256 matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1) 287 matrx[6,y] = sum(tmp$totalMutationsAtAT)
257 matrx[7,x] = sum(tmp$nonSilentMutationsCDR) 288 matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1)
258 matrx[7,y] = sum(tmp$silentMutationsCDR) 289
259 matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1) 290 matrx[7,x] = sum(tmp$totalMutationsAtAT)
291 matrx[7,y] = sum(tmp$VRegionMutations)
292 matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1)
293
294 matrx[8,x] = sum(tmp$nonSilentMutationsFR)
295 matrx[8,y] = sum(tmp$silentMutationsFR)
296 matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1)
297
298 matrx[9,x] = sum(tmp$nonSilentMutationsCDR)
299 matrx[9,y] = sum(tmp$silentMutationsCDR)
300 matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1)
260 301
261 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) 302 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
262 row.names(transitionTable) = c("A", "C", "G", "T") 303 row.names(transitionTable) = c("A", "C", "G", "T")
263 transitionTable["A","A"] = NA 304 transitionTable["A","A"] = NA
264 transitionTable["C","C"] = NA 305 transitionTable["C","C"] = NA
291 cat(length(tmp$Sequence.ID), file="total_n.txt") 332 cat(length(tmp$Sequence.ID), file="total_n.txt")
292 333
293 334
294 335
295 result = data.frame(matrx) 336 result = data.frame(matrx)
296 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)", "FR R/S (ratio)", "CDR R/S (ratio)") 337 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)")
297 338
298 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) 339 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
299 340
300 341
301 if (!("ggplot2" %in% rownames(installed.packages()))) { 342 if (!("ggplot2" %in% rownames(installed.packages()))) {
302 install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 343 install.packages("ggplot2", repos="http://cran.xl-mirror.nl/")
303 } 344 }
304 library(ggplot2) 345
305 346
306 genesForPlot = gsub("[0-9]", "", dat$best_match) 347 genesForPlot = gsub("[0-9]", "", dat$best_match)
307 genesForPlot = data.frame(table(genesForPlot)) 348 genesForPlot = data.frame(table(genesForPlot))
308 colnames(genesForPlot) = c("Gene","Freq") 349 colnames(genesForPlot) = c("Gene","Freq")
309 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) 350 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
358 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) 399 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)
359 400
360 p = ggplot(dat, aes(best_match, percentage_mutations)) 401 p = ggplot(dat, aes(best_match, percentage_mutations))
361 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) 402 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)
362 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") 403 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")
363 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
364
365 404
366 png(filename="scatter.png") 405 png(filename="scatter.png")
367 print(p) 406 print(p)
368 dev.off() 407 dev.off()
369 408
370 409 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
371 410
372 411 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T)
373 412
374 413
375 414
376 415
377 416
378 417
379 418 dat$best_match_class = substr(dat$best_match, 0, 2)
380 419 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
381 420 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)
382 421
383 422 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])
384 423
385 424 p = ggplot(frequency_bins_data, aes(frequency_bins, frequency_count))
386 425 p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge")
387 426 p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class")
388 427
389 428 png(filename="frequency_ranges.png")
390 429 print(p)
391 430 dev.off()
392 431
393 432 frequency_bins_data_by_class = frequency_bins_data
394 433
395 434 write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\t",quote=F,row.names=F,col.names=T)
396 435
397 436 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "frequency_bins")])
398 437
399 438 write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T)
400 439
440
441 #frequency_bins_data_by_class
442 #frequency_ranges_subclasses.txt
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469