comparison mutation_analysis.r @ 0:74d2bc479bee draft

Uploaded
author davidvanzessen
date Mon, 18 Aug 2014 04:04:37 -0400
parents
children 2f4298673519
comparison
equal deleted inserted replaced
-1:000000000000 0:74d2bc479bee
1 args <- commandArgs(trailingOnly = TRUE)
2
3 input = args[1]
4 summaryinput = args[2]
5 outputdir = args[3]
6 setwd(outputdir)
7
8 #dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F)
9 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
10
11 datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F)
12 datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")]
13
14 dat = merge(dat, datSum, by="Sequence.ID", all.x=T)
15
16 #dat = dat[dat$Functionality == "productive",]
17
18 dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
19 dat$VGene = gsub("[*].*", "", dat$VGene)
20
21 dat$past = paste(dat$AA.JUNCTION, dat$VGene)
22
23 #dat = dat[!duplicated(dat$past), ]
24
25 if(length(dat$Sequence.ID) == 0){
26 setwd(outputdir)
27 result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
28 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
29 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
30 transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
31 row.names(transitionTable) = c("A", "C", "G", "T")
32 transitionTable["A","A"] = NA
33 transitionTable["C","C"] = NA
34 transitionTable["G","G"] = NA
35 transitionTable["T","T"] = NA
36 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
37 cat("0", file="n.txt")
38 stop("No data")
39 }
40
41 #print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left"))
42
43 result = data.frame(x = 1:5, y = 1:5, z = 1:5)
44 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
45
46 cleanup_columns = c("FR1.IMGT.c.a",
47 "FR2.IMGT.g.t",
48 "CDR1.IMGT.Nb.of.nucleotides",
49 "CDR2.IMGT.t.a",
50 "FR1.IMGT.c.g",
51 "CDR1.IMGT.c.t",
52 "FR2.IMGT.a.c",
53 "FR2.IMGT.Nb.of.mutations",
54 "FR2.IMGT.g.c",
55 "FR2.IMGT.a.g",
56 "FR3.IMGT.t.a",
57 "FR3.IMGT.t.c",
58 "FR2.IMGT.g.a",
59 "FR3.IMGT.c.g",
60 "FR1.IMGT.Nb.of.mutations",
61 "CDR1.IMGT.g.a",
62 "CDR1.IMGT.t.g",
63 "CDR1.IMGT.g.c",
64 "CDR2.IMGT.Nb.of.nucleotides",
65 "FR2.IMGT.a.t",
66 "CDR1.IMGT.Nb.of.mutations",
67 "CDR1.IMGT.a.g",
68 "FR3.IMGT.a.c",
69 "FR1.IMGT.g.a",
70 "FR3.IMGT.a.g",
71 "FR1.IMGT.a.t",
72 "CDR2.IMGT.a.g",
73 "CDR2.IMGT.Nb.of.mutations",
74 "CDR2.IMGT.g.t",
75 "CDR2.IMGT.a.c",
76 "CDR1.IMGT.t.c",
77 "FR3.IMGT.g.c",
78 "FR1.IMGT.g.t",
79 "FR3.IMGT.g.t",
80 "CDR1.IMGT.a.t",
81 "FR1.IMGT.a.g",
82 "FR3.IMGT.a.t",
83 "FR3.IMGT.Nb.of.nucleotides",
84 "FR2.IMGT.t.c",
85 "CDR2.IMGT.g.a",
86 "FR2.IMGT.t.a",
87 "CDR1.IMGT.t.a",
88 "FR2.IMGT.t.g",
89 "FR3.IMGT.t.g",
90 "FR2.IMGT.Nb.of.nucleotides",
91 "FR1.IMGT.t.a",
92 "FR1.IMGT.t.g",
93 "FR3.IMGT.c.t",
94 "FR1.IMGT.t.c",
95 "CDR2.IMGT.a.t",
96 "FR2.IMGT.c.t",
97 "CDR1.IMGT.g.t",
98 "CDR2.IMGT.t.g",
99 "FR1.IMGT.Nb.of.nucleotides",
100 "CDR1.IMGT.c.g",
101 "CDR2.IMGT.t.c",
102 "FR3.IMGT.g.a",
103 "CDR1.IMGT.a.c",
104 "FR2.IMGT.c.a",
105 "FR3.IMGT.Nb.of.mutations",
106 "FR2.IMGT.c.g",
107 "CDR2.IMGT.g.c",
108 "FR1.IMGT.g.c",
109 "CDR2.IMGT.c.t",
110 "FR3.IMGT.c.a",
111 "CDR1.IMGT.c.a",
112 "CDR2.IMGT.c.g",
113 "CDR2.IMGT.c.a",
114 "FR1.IMGT.c.t")
115
116 for(col in cleanup_columns){
117 dat[,col] = gsub("\\(.*\\)", "", dat[,col])
118 #dat[dat[,col] == "",] = "0"
119 dat[,col] = as.numeric(dat[,col])
120 dat[is.na(dat[,col]),] = 0
121 }
122
123 VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations +
124 dat$CDR1.IMGT.Nb.of.mutations +
125 dat$FR2.IMGT.Nb.of.mutations +
126 dat$CDR2.IMGT.Nb.of.mutations +
127 dat$FR3.IMGT.Nb.of.mutations)
128
129 VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides +
130 dat$CDR1.IMGT.Nb.of.nucleotides +
131 dat$FR2.IMGT.Nb.of.nucleotides +
132 dat$CDR2.IMGT.Nb.of.nucleotides +
133 dat$FR3.IMGT.Nb.of.nucleotides)
134
135 result[1,"x"] = VRegionMutations
136 result[1,"y"] = VRegionNucleotides
137 result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1)
138
139 transitionMutations = sum(dat$FR1.IMGT.a.g +
140 dat$FR1.IMGT.g.a +
141 dat$FR1.IMGT.c.t +
142 dat$FR1.IMGT.t.c +
143 dat$CDR1.IMGT.a.g +
144 dat$CDR1.IMGT.g.a +
145 dat$CDR1.IMGT.c.t +
146 dat$CDR1.IMGT.t.c +
147 dat$FR2.IMGT.a.g +
148 dat$FR2.IMGT.g.a +
149 dat$FR2.IMGT.c.t +
150 dat$FR2.IMGT.t.c +
151 dat$CDR2.IMGT.a.g +
152 dat$CDR2.IMGT.g.a +
153 dat$CDR2.IMGT.c.t +
154 dat$CDR2.IMGT.t.c +
155 dat$FR3.IMGT.a.g +
156 dat$FR3.IMGT.g.a +
157 dat$FR3.IMGT.c.t +
158 dat$FR3.IMGT.t.c)
159
160 result[2,"x"] = transitionMutations
161 result[2,"y"] = VRegionMutations
162 result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1)
163
164 transversionMutations = sum( dat$FR1.IMGT.a.c +
165 dat$FR1.IMGT.c.a +
166 dat$FR1.IMGT.a.t +
167 dat$FR1.IMGT.t.a +
168 dat$FR1.IMGT.g.c +
169 dat$FR1.IMGT.c.g +
170 dat$FR1.IMGT.g.t +
171 dat$FR1.IMGT.t.g +
172 dat$CDR1.IMGT.a.c +
173 dat$CDR1.IMGT.c.a +
174 dat$CDR1.IMGT.a.t +
175 dat$CDR1.IMGT.t.a +
176 dat$CDR1.IMGT.g.c +
177 dat$CDR1.IMGT.c.g +
178 dat$CDR1.IMGT.g.t +
179 dat$CDR1.IMGT.t.g +
180 dat$FR2.IMGT.a.c +
181 dat$FR2.IMGT.c.a +
182 dat$FR2.IMGT.a.t +
183 dat$FR2.IMGT.t.a +
184 dat$FR2.IMGT.g.c +
185 dat$FR2.IMGT.c.g +
186 dat$FR2.IMGT.g.t +
187 dat$FR2.IMGT.t.g +
188 dat$CDR2.IMGT.a.c +
189 dat$CDR2.IMGT.c.a +
190 dat$CDR2.IMGT.a.t +
191 dat$CDR2.IMGT.t.a +
192 dat$CDR2.IMGT.g.c +
193 dat$CDR2.IMGT.c.g +
194 dat$CDR2.IMGT.g.t +
195 dat$CDR2.IMGT.t.g +
196 dat$FR3.IMGT.a.c +
197 dat$FR3.IMGT.c.a +
198 dat$FR3.IMGT.a.t +
199 dat$FR3.IMGT.t.a +
200 dat$FR3.IMGT.g.c +
201 dat$FR3.IMGT.c.g +
202 dat$FR3.IMGT.g.t +
203 dat$FR3.IMGT.t.g)
204
205 result[3,"x"] = transversionMutations
206 result[3,"y"] = VRegionMutations
207 result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1)
208
209 transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a +
210 dat$FR1.IMGT.c.t +
211 dat$CDR1.IMGT.g.a +
212 dat$CDR1.IMGT.c.t +
213 dat$FR2.IMGT.g.a +
214 dat$FR2.IMGT.c.t +
215 dat$CDR2.IMGT.g.a +
216 dat$CDR2.IMGT.c.t +
217 dat$FR3.IMGT.g.a +
218 dat$FR3.IMGT.c.t)
219
220 totalMutationsAtGC = sum(dat$FR1.IMGT.g.a +
221 dat$FR1.IMGT.c.t +
222 dat$FR1.IMGT.c.a +
223 dat$FR1.IMGT.g.c +
224 dat$FR1.IMGT.c.g +
225 dat$FR1.IMGT.g.t +
226 dat$CDR1.IMGT.g.a +
227 dat$CDR1.IMGT.c.t +
228 dat$CDR1.IMGT.c.a +
229 dat$CDR1.IMGT.g.c +
230 dat$CDR1.IMGT.c.g +
231 dat$CDR1.IMGT.g.t +
232 dat$FR2.IMGT.g.a +
233 dat$FR2.IMGT.c.t +
234 dat$FR2.IMGT.c.a +
235 dat$FR2.IMGT.g.c +
236 dat$FR2.IMGT.c.g +
237 dat$FR2.IMGT.g.t +
238 dat$CDR2.IMGT.g.a +
239 dat$CDR2.IMGT.c.t +
240 dat$CDR2.IMGT.c.a +
241 dat$CDR2.IMGT.g.c +
242 dat$CDR2.IMGT.c.g +
243 dat$CDR2.IMGT.g.t +
244 dat$FR3.IMGT.g.a +
245 dat$FR3.IMGT.c.t +
246 dat$FR3.IMGT.c.a +
247 dat$FR3.IMGT.g.c +
248 dat$FR3.IMGT.c.g +
249 dat$FR3.IMGT.g.t)
250
251 result[4,"x"] = transitionMutationsAtGC
252 result[4,"y"] = totalMutationsAtGC
253 result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1)
254
255 result[5,"x"] = totalMutationsAtGC
256 result[5,"y"] = VRegionMutations
257 result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1)
258
259
260 #transitiontable
261
262 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
263 row.names(transitionTable) = c("A", "C", "G", "T")
264 transitionTable["A","A"] = NA
265 transitionTable["C","C"] = NA
266 transitionTable["G","G"] = NA
267 transitionTable["T","T"] = NA
268 nts = c("a", "c", "g", "t")
269
270
271 for(nt1 in nts){
272 for(nt2 in nts){
273 if(nt1 == nt2){
274 next
275 }
276 NT1 = LETTERS[letters == nt1]
277 NT2 = LETTERS[letters == nt2]
278 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
279 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
280 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
281 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
282 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
283 transitionTable[NT1,NT2] = sum( dat[,FR1] +
284 dat[,CDR1] +
285 dat[,FR2] +
286 dat[,CDR2] +
287 dat[,FR3])
288 }
289 }
290
291 setwd(outputdir)
292 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
293 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
294 cat(length(dat$Sequence.ID), file="n.txt")
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328