comparison mutation_analysis.r @ 11:0510cf1f7cbc draft

Uploaded
author davidvanzessen
date Tue, 04 Aug 2015 09:59:26 -0400
parents
children
comparison
equal deleted inserted replaced
10:edbf4fba5fc7 11:0510cf1f7cbc
1 args <- commandArgs(trailingOnly = TRUE)
2
3 input = args[1]
4 genes = unlist(strsplit(args[2], ","))
5 outputdir = args[3]
6 print(args[4])
7 include_fr1 = ifelse(args[4] == "yes", T, F)
8 setwd(outputdir)
9
10 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
11
12 if(length(dat$Sequence.ID) == 0){
13 setwd(outputdir)
14 result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
15 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
16 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
17 transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
18 row.names(transitionTable) = c("A", "C", "G", "T")
19 transitionTable["A","A"] = NA
20 transitionTable["C","C"] = NA
21 transitionTable["G","G"] = NA
22 transitionTable["T","T"] = NA
23 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
24 cat("0", file="n.txt")
25 stop("No data")
26 }
27
28
29
30 cleanup_columns = c("FR1.IMGT.c.a",
31 "FR2.IMGT.g.t",
32 "CDR1.IMGT.Nb.of.nucleotides",
33 "CDR2.IMGT.t.a",
34 "FR1.IMGT.c.g",
35 "CDR1.IMGT.c.t",
36 "FR2.IMGT.a.c",
37 "FR2.IMGT.Nb.of.mutations",
38 "FR2.IMGT.g.c",
39 "FR2.IMGT.a.g",
40 "FR3.IMGT.t.a",
41 "FR3.IMGT.t.c",
42 "FR2.IMGT.g.a",
43 "FR3.IMGT.c.g",
44 "FR1.IMGT.Nb.of.mutations",
45 "CDR1.IMGT.g.a",
46 "CDR1.IMGT.t.g",
47 "CDR1.IMGT.g.c",
48 "CDR2.IMGT.Nb.of.nucleotides",
49 "FR2.IMGT.a.t",
50 "CDR1.IMGT.Nb.of.mutations",
51 "CDR1.IMGT.a.g",
52 "FR3.IMGT.a.c",
53 "FR1.IMGT.g.a",
54 "FR3.IMGT.a.g",
55 "FR1.IMGT.a.t",
56 "CDR2.IMGT.a.g",
57 "CDR2.IMGT.Nb.of.mutations",
58 "CDR2.IMGT.g.t",
59 "CDR2.IMGT.a.c",
60 "CDR1.IMGT.t.c",
61 "FR3.IMGT.g.c",
62 "FR1.IMGT.g.t",
63 "FR3.IMGT.g.t",
64 "CDR1.IMGT.a.t",
65 "FR1.IMGT.a.g",
66 "FR3.IMGT.a.t",
67 "FR3.IMGT.Nb.of.nucleotides",
68 "FR2.IMGT.t.c",
69 "CDR2.IMGT.g.a",
70 "FR2.IMGT.t.a",
71 "CDR1.IMGT.t.a",
72 "FR2.IMGT.t.g",
73 "FR3.IMGT.t.g",
74 "FR2.IMGT.Nb.of.nucleotides",
75 "FR1.IMGT.t.a",
76 "FR1.IMGT.t.g",
77 "FR3.IMGT.c.t",
78 "FR1.IMGT.t.c",
79 "CDR2.IMGT.a.t",
80 "FR2.IMGT.c.t",
81 "CDR1.IMGT.g.t",
82 "CDR2.IMGT.t.g",
83 "FR1.IMGT.Nb.of.nucleotides",
84 "CDR1.IMGT.c.g",
85 "CDR2.IMGT.t.c",
86 "FR3.IMGT.g.a",
87 "CDR1.IMGT.a.c",
88 "FR2.IMGT.c.a",
89 "FR3.IMGT.Nb.of.mutations",
90 "FR2.IMGT.c.g",
91 "CDR2.IMGT.g.c",
92 "FR1.IMGT.g.c",
93 "CDR2.IMGT.c.t",
94 "FR3.IMGT.c.a",
95 "CDR1.IMGT.c.a",
96 "CDR2.IMGT.c.g",
97 "CDR2.IMGT.c.a",
98 "FR1.IMGT.c.t")
99
100 for(col in cleanup_columns){
101 dat[,col] = gsub("\\(.*\\)", "", dat[,col])
102 #dat[dat[,col] == "",] = "0"
103 dat[,col] = as.numeric(dat[,col])
104 dat[is.na(dat[,col]),] = 0
105 }
106
107 regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3")
108 if(!include_fr1){
109 regions = c("CDR1", "FR2", "CDR2", "FR3")
110 }
111
112 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }
113
114 VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="")
115 dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns)
116
117 VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="")
118 dat$VRegionNucleotides = apply(dat, FUN=sum_by_row, 1, columns=VRegionNucleotides_columns)
119
120 transitionMutations_columns = paste(rep(regions, each=4), c(".IMGT.a.g", ".IMGT.g.a", ".IMGT.c.t", ".IMGT.t.c"), sep="")
121 dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns)
122
123 transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="")
124 dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns)
125
126
127 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="")
128 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns)
129
130 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t"), sep="")
131 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns)
132
133 FRRegions = regions[grepl("FR", regions)]
134 CDRRegions = regions[grepl("CDR", regions)]
135
136 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="")
137 dat$silentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_silentMutations_columns)
138
139 CDR_silentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.silent.mutations", sep="")
140 dat$silentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_silentMutations_columns)
141
142 FR_nonSilentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="")
143 dat$nonSilentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_nonSilentMutations_columns)
144
145 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="")
146 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns)
147
148
149 setwd(outputdir)
150
151 nts = c("a", "c", "g", "t")
152 zeros=rep(0, 4)
153 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=7)
154 for(i in 1:length(genes)){
155 gene = genes[i]
156 tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),]
157 if(gene == "."){
158 tmp = dat
159 }
160 j = i - 1
161 x = (j * 3) + 1
162 y = (j * 3) + 2
163 z = (j * 3) + 3
164 matrx[1,x] = sum(tmp$VRegionMutations)
165 matrx[1,y] = sum(tmp$VRegionNucleotides)
166 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
167 matrx[2,x] = sum(tmp$transitionMutations)
168 matrx[2,y] = sum(tmp$VRegionMutations)
169 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
170 matrx[3,x] = sum(tmp$transversionMutations)
171 matrx[3,y] = sum(tmp$VRegionMutations)
172 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
173 matrx[4,x] = sum(tmp$transitionMutationsAtGC)
174 matrx[4,y] = sum(tmp$totalMutationsAtGC)
175 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
176 matrx[5,x] = sum(tmp$totalMutationsAtGC)
177 matrx[5,y] = sum(tmp$VRegionMutations)
178 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
179 matrx[6,x] = sum(tmp$nonSilentMutationsFR)
180 matrx[6,y] = sum(tmp$silentMutationsFR)
181 matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1)
182 matrx[7,x] = sum(tmp$nonSilentMutationsCDR)
183 matrx[7,y] = sum(tmp$silentMutationsCDR)
184 matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1)
185
186
187 transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros)
188 row.names(transitionTable) = c("A", "C", "G", "T")
189 transitionTable["A","A"] = NA
190 transitionTable["C","C"] = NA
191 transitionTable["G","G"] = NA
192 transitionTable["T","T"] = NA
193
194 if(nrow(tmp) > 0){
195 for(nt1 in nts){
196 for(nt2 in nts){
197 if(nt1 == nt2){
198 next
199 }
200 NT1 = LETTERS[letters == nt1]
201 NT2 = LETTERS[letters == nt2]
202 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
203 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
204 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
205 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
206 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
207 if(include_fr1){
208 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)])
209 } else {
210 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)])
211 }
212 }
213 }
214 }
215
216
217 write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
218 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
219
220 cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep=""))
221 cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep=""))
222 }
223
224 #again for all of the data
225 tmp = dat
226 j = i
227 x = (j * 3) + 1
228 y = (j * 3) + 2
229 z = (j * 3) + 3
230 matrx[1,x] = sum(tmp$VRegionMutations)
231 matrx[1,y] = sum(tmp$VRegionNucleotides)
232 matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
233 matrx[2,x] = sum(tmp$transitionMutations)
234 matrx[2,y] = sum(tmp$VRegionMutations)
235 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
236 matrx[3,x] = sum(tmp$transversionMutations)
237 matrx[3,y] = sum(tmp$VRegionMutations)
238 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
239 matrx[4,x] = sum(tmp$transitionMutationsAtGC)
240 matrx[4,y] = sum(tmp$totalMutationsAtGC)
241 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
242 matrx[5,x] = sum(tmp$totalMutationsAtGC)
243 matrx[5,y] = sum(tmp$VRegionMutations)
244 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
245 matrx[6,x] = sum(tmp$nonSilentMutationsFR)
246 matrx[6,y] = sum(tmp$silentMutationsFR)
247 matrx[6,z] = round(matrx[6,x] / matrx[6,y], digits=1)
248 matrx[7,x] = sum(tmp$nonSilentMutationsCDR)
249 matrx[7,y] = sum(tmp$silentMutationsCDR)
250 matrx[7,z] = round(matrx[7,x] / matrx[7,y], digits=1)
251
252 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
253 row.names(transitionTable) = c("A", "C", "G", "T")
254 transitionTable["A","A"] = NA
255 transitionTable["C","C"] = NA
256 transitionTable["G","G"] = NA
257 transitionTable["T","T"] = NA
258
259
260 for(nt1 in nts){
261 for(nt2 in nts){
262 if(nt1 == nt2){
263 next
264 }
265 NT1 = LETTERS[letters == nt1]
266 NT2 = LETTERS[letters == nt2]
267 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
268 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
269 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
270 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
271 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
272 if(include_fr1){
273 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)])
274 } else {
275 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)])
276 }
277 }
278 }
279 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
280 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T)
281 cat(matrx[1,x], file="total_value.txt")
282 cat(length(tmp$Sequence.ID), file="total_n.txt")
283
284
285
286 result = data.frame(matrx)
287 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)", "FR R/S (ratio)", "CDR R/S (ratio)")
288
289 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
290
291
292 if (!("ggplot2" %in% rownames(installed.packages()))) {
293 install.packages("ggplot2", repos="http://cran.xl-mirror.nl/")
294 }
295 library(ggplot2)
296
297 genesForPlot = gsub("[0-9]", "", dat$best_match)
298 genesForPlot = data.frame(table(genesForPlot))
299 colnames(genesForPlot) = c("Gene","Freq")
300 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
301 write.table(genesForPlot, "all.txt", sep="\t",quote=F,row.names=F,col.names=T)
302
303
304 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
305 pc = pc + geom_bar(width = 1, stat = "identity")
306 pc = pc + coord_polar(theta="y")
307 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("Classes", "( n =", sum(genesForPlot$Freq), ")"))
308
309 png(filename="all.png")
310 pc
311 dev.off()
312
313
314 #blegh
315 genesForPlot = dat[grepl("ca", dat$best_match),]$best_match
316 if(length(genesForPlot) > 0){
317 genesForPlot = data.frame(table(genesForPlot))
318 colnames(genesForPlot) = c("Gene","Freq")
319 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
320
321 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
322 pc = pc + geom_bar(width = 1, stat = "identity")
323 pc = pc + coord_polar(theta="y")
324 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA subclasses", "( n =", sum(genesForPlot$Freq), ")"))
325 write.table(genesForPlot, "ca.txt", sep="\t",quote=F,row.names=F,col.names=T)
326
327 png(filename="ca.png")
328 print(pc)
329 dev.off()
330 }
331
332 genesForPlot = dat[grepl("cg", dat$best_match),]$best_match
333 if(length(genesForPlot) > 0){
334 genesForPlot = data.frame(table(genesForPlot))
335 colnames(genesForPlot) = c("Gene","Freq")
336 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
337
338 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
339 pc = pc + geom_bar(width = 1, stat = "identity")
340 pc = pc + coord_polar(theta="y")
341 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG subclasses", "( n =", sum(genesForPlot$Freq), ")"))
342 write.table(genesForPlot, "cg.txt", sep="\t",quote=F,row.names=F,col.names=T)
343
344 png(filename="cg.png")
345 print(pc)
346 dev.off()
347 }
348
349 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)
350
351 p = ggplot(dat, aes(best_match, percentage_mutations))
352 p = p + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) + geom_point(aes(colour=best_match), position="jitter")
353 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")
354 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
355
356
357 png(filename="scatter.png")
358 print(p)
359 dev.off()
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391