comparison mutation_analysis.r @ 22:d84c9791d8c4 draft

Uploaded
author davidvanzessen
date Tue, 07 Apr 2015 03:52:34 -0400
parents cb7c65e3e43f
children 28b8d980db22
comparison
equal deleted inserted replaced
21:c9f9623f1f76 22:d84c9791d8c4
1 args <- commandArgs(trailingOnly = TRUE) 1 args <- commandArgs(trailingOnly = TRUE)
2 2
3 input = args[1] 3 input = args[1]
4 genes = unlist(strsplit(args[2], ",")) 4 genes = unlist(strsplit(args[2], ","))
5 outputdir = args[3] 5 outputdir = args[3]
6 print(args[4])
7 include_fr1 = ifelse(args[4] == "yes", T, F)
6 setwd(outputdir) 8 setwd(outputdir)
7 9
8 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) 10 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
9 11
10 if(length(dat$Sequence.ID) == 0){ 12 if(length(dat$Sequence.ID) == 0){
100 #dat[dat[,col] == "",] = "0" 102 #dat[dat[,col] == "",] = "0"
101 dat[,col] = as.numeric(dat[,col]) 103 dat[,col] = as.numeric(dat[,col])
102 dat[is.na(dat[,col]),] = 0 104 dat[is.na(dat[,col]),] = 0
103 } 105 }
104 106
105 dat$VRegionMutations = dat$CDR1.IMGT.Nb.of.mutations + 107 regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3")
106 dat$FR2.IMGT.Nb.of.mutations + 108 if(!include_fr1){
107 dat$CDR2.IMGT.Nb.of.mutations + 109 regions = c("CDR1", "FR2", "CDR2", "FR3")
108 dat$FR3.IMGT.Nb.of.mutations 110 }
109 111
110 dat$VRegionNucleotides = dat$CDR1.IMGT.Nb.of.nucleotides + 112 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }
111 dat$FR2.IMGT.Nb.of.nucleotides + 113
112 dat$CDR2.IMGT.Nb.of.nucleotides + 114 VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="")
113 dat$FR3.IMGT.Nb.of.nucleotides 115 dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns)
114 116
115 dat$transitionMutations = dat$CDR1.IMGT.a.g + 117 VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="")
116 dat$CDR1.IMGT.g.a + 118 dat$VRegionNucleotides = apply(dat, FUN=sum_by_row, 1, columns=VRegionNucleotides_columns)
117 dat$CDR1.IMGT.c.t + 119
118 dat$CDR1.IMGT.t.c + 120 transitionMutations_columns = paste(rep(regions, each=4), c(".IMGT.a.g", ".IMGT.g.a", ".IMGT.c.t", ".IMGT.t.c"), sep="")
119 dat$FR2.IMGT.a.g + 121 dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns)
120 dat$FR2.IMGT.g.a + 122
121 dat$FR2.IMGT.c.t + 123 transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="")
122 dat$FR2.IMGT.t.c + 124 dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns)
123 dat$CDR2.IMGT.a.g + 125
124 dat$CDR2.IMGT.g.a + 126
125 dat$CDR2.IMGT.c.t + 127 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="")
126 dat$CDR2.IMGT.t.c + 128 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns)
127 dat$FR3.IMGT.a.g + 129
128 dat$FR3.IMGT.g.a + 130 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t"), sep="")
129 dat$FR3.IMGT.c.t + 131 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns)
130 dat$FR3.IMGT.t.c
131
132 dat$transversionMutations = dat$CDR1.IMGT.a.c +
133 dat$CDR1.IMGT.c.a +
134 dat$CDR1.IMGT.a.t +
135 dat$CDR1.IMGT.t.a +
136 dat$CDR1.IMGT.g.c +
137 dat$CDR1.IMGT.c.g +
138 dat$CDR1.IMGT.g.t +
139 dat$CDR1.IMGT.t.g +
140 dat$FR2.IMGT.a.c +
141 dat$FR2.IMGT.c.a +
142 dat$FR2.IMGT.a.t +
143 dat$FR2.IMGT.t.a +
144 dat$FR2.IMGT.g.c +
145 dat$FR2.IMGT.c.g +
146 dat$FR2.IMGT.g.t +
147 dat$FR2.IMGT.t.g +
148 dat$CDR2.IMGT.a.c +
149 dat$CDR2.IMGT.c.a +
150 dat$CDR2.IMGT.a.t +
151 dat$CDR2.IMGT.t.a +
152 dat$CDR2.IMGT.g.c +
153 dat$CDR2.IMGT.c.g +
154 dat$CDR2.IMGT.g.t +
155 dat$CDR2.IMGT.t.g +
156 dat$FR3.IMGT.a.c +
157 dat$FR3.IMGT.c.a +
158 dat$FR3.IMGT.a.t +
159 dat$FR3.IMGT.t.a +
160 dat$FR3.IMGT.g.c +
161 dat$FR3.IMGT.c.g +
162 dat$FR3.IMGT.g.t +
163 dat$FR3.IMGT.t.g
164
165
166 dat$transitionMutationsAtGC = dat$CDR1.IMGT.g.a +
167 dat$CDR1.IMGT.c.t +
168 dat$FR2.IMGT.g.a +
169 dat$FR2.IMGT.c.t +
170 dat$CDR2.IMGT.g.a +
171 dat$CDR2.IMGT.c.t +
172 dat$FR3.IMGT.g.a +
173 dat$FR3.IMGT.c.t
174
175 dat$totalMutationsAtGC = dat$CDR1.IMGT.g.a +
176 dat$CDR1.IMGT.c.t +
177 dat$CDR1.IMGT.c.a +
178 dat$CDR1.IMGT.g.c +
179 dat$CDR1.IMGT.c.g +
180 dat$CDR1.IMGT.g.t +
181 dat$FR2.IMGT.g.a +
182 dat$FR2.IMGT.c.t +
183 dat$FR2.IMGT.c.a +
184 dat$FR2.IMGT.g.c +
185 dat$FR2.IMGT.c.g +
186 dat$FR2.IMGT.g.t +
187 dat$CDR2.IMGT.g.a +
188 dat$CDR2.IMGT.c.t +
189 dat$CDR2.IMGT.c.a +
190 dat$CDR2.IMGT.g.c +
191 dat$CDR2.IMGT.c.g +
192 dat$CDR2.IMGT.g.t +
193 dat$FR3.IMGT.g.a +
194 dat$FR3.IMGT.c.t +
195 dat$FR3.IMGT.c.a +
196 dat$FR3.IMGT.g.c +
197 dat$FR3.IMGT.c.g +
198 dat$FR3.IMGT.g.t
199 132
200 133
201 134
202 setwd(outputdir) 135 setwd(outputdir)
203 136
137 nts = c("a", "t", "g", "c")
138 zeros=rep(0, 4)
204 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=5) 139 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=5)
205 for(i in 1:length(genes)){ 140 for(i in 1:length(genes)){
206 gene = genes[i] 141 gene = genes[i]
207 tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] 142 tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),]
208 if(gene == "."){ 143 if(gene == "."){
209 tmp = dat 144 tmp = dat
210 }
211 if(length(tmp) == 0){
212 cat("0", file=paste(gene, "_value.txt" ,sep=""))
213 next
214 } 145 }
215 j = i - 1 146 j = i - 1
216 x = (j * 3) + 1 147 x = (j * 3) + 1
217 y = (j * 3) + 2 148 y = (j * 3) + 2
218 z = (j * 3) + 3 149 z = (j * 3) + 3
230 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) 161 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
231 matrx[5,x] = sum(tmp$totalMutationsAtGC) 162 matrx[5,x] = sum(tmp$totalMutationsAtGC)
232 matrx[5,y] = sum(tmp$VRegionMutations) 163 matrx[5,y] = sum(tmp$VRegionMutations)
233 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) 164 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
234 165
235 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) 166 transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros)
236 row.names(transitionTable) = c("A", "C", "G", "T") 167 row.names(transitionTable) = c("A", "C", "G", "T")
237 transitionTable["A","A"] = NA 168 transitionTable["A","A"] = NA
238 transitionTable["C","C"] = NA 169 transitionTable["C","C"] = NA
239 transitionTable["G","G"] = NA 170 transitionTable["G","G"] = NA
240 transitionTable["T","T"] = NA 171 transitionTable["T","T"] = NA
241 nts = c("a", "c", "g", "t") 172
173 if(nrow(tmp) > 0){
174 for(nt1 in nts){
175 for(nt2 in nts){
176 if(nt1 == nt2){
177 next
178 }
179 NT1 = LETTERS[letters == nt1]
180 NT2 = LETTERS[letters == nt2]
181 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
182 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
183 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
184 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
185 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
186 if(include_fr1){
187 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)])
188 } else {
189 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)])
190 }
191 }
192 }
193 }
242 194
243 195
244 for(nt1 in nts){
245 for(nt2 in nts){
246 if(nt1 == nt2){
247 next
248 }
249 NT1 = LETTERS[letters == nt1]
250 NT2 = LETTERS[letters == nt2]
251 FR1 = 0 #paste("FR1.IMGT.", nt1, ".", nt2, sep="")
252 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
253 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
254 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
255 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
256 transitionTable[NT1,NT2] = sum( tmp[,CDR1] +
257 tmp[,FR2] +
258 tmp[,CDR2] +
259 tmp[,FR3])
260 }
261 }
262 write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) 196 write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
263 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) 197 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
264 198
265 cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep="")) 199 cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep=""))
266 cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep="")) 200 cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep=""))
292 row.names(transitionTable) = c("A", "C", "G", "T") 226 row.names(transitionTable) = c("A", "C", "G", "T")
293 transitionTable["A","A"] = NA 227 transitionTable["A","A"] = NA
294 transitionTable["C","C"] = NA 228 transitionTable["C","C"] = NA
295 transitionTable["G","G"] = NA 229 transitionTable["G","G"] = NA
296 transitionTable["T","T"] = NA 230 transitionTable["T","T"] = NA
297 nts = c("a", "c", "g", "t")
298 231
299 232
300 for(nt1 in nts){ 233 for(nt1 in nts){
301 for(nt2 in nts){ 234 for(nt2 in nts){
302 if(nt1 == nt2){ 235 if(nt1 == nt2){
307 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") 240 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
308 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") 241 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
309 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") 242 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
310 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") 243 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
311 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") 244 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
312 transitionTable[NT1,NT2] = sum( tmp[,CDR1] + 245 if(include_fr1){
313 tmp[,FR2] + 246 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)])
314 tmp[,CDR2] + 247 } else {
315 tmp[,FR3]) 248 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)])
249 }
316 } 250 }
317 } 251 }
318 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) 252 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
319 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T) 253 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T)
320 cat(matrx[1,x], file="total_value.txt") 254 cat(matrx[1,x], file="total_value.txt")
380 314
381 png(filename="cg.png") 315 png(filename="cg.png")
382 print(pc) 316 print(pc)
383 dev.off() 317 dev.off()
384 } 318 }
319
320 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)
321
322 p = ggplot(dat, aes(best_match, percentage_mutations))# + scale_y_log10(breaks=scales,labels=scales)
323 p = p + geom_point(aes(colour=best_match), position="jitter")
324 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")
325
326 png(filename="scatter.png")
327 print(p)
328 dev.off()
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360