0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 input = args[1]
|
|
4 summaryinput = args[2]
|
|
5 outputdir = args[3]
|
|
6 setwd(outputdir)
|
|
7
|
|
8 #dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F)
|
|
9 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
|
|
10
|
|
11 datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F)
|
|
12 datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")]
|
|
13
|
|
14 dat = merge(dat, datSum, by="Sequence.ID", all.x=T)
|
|
15
|
|
16 #dat = dat[dat$Functionality == "productive",]
|
|
17
|
|
18 dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
|
|
19 dat$VGene = gsub("[*].*", "", dat$VGene)
|
|
20
|
|
21 dat$past = paste(dat$AA.JUNCTION, dat$VGene)
|
|
22
|
|
23 #dat = dat[!duplicated(dat$past), ]
|
|
24
|
|
25 if(length(dat$Sequence.ID) == 0){
|
|
26 setwd(outputdir)
|
|
27 result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
|
|
28 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
|
|
29 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
|
|
30 transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
|
|
31 row.names(transitionTable) = c("A", "C", "G", "T")
|
|
32 transitionTable["A","A"] = NA
|
|
33 transitionTable["C","C"] = NA
|
|
34 transitionTable["G","G"] = NA
|
|
35 transitionTable["T","T"] = NA
|
|
36 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
|
|
37 cat("0", file="n.txt")
|
|
38 stop("No data")
|
|
39 }
|
|
40
|
|
41 #print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left"))
|
|
42
|
|
43 result = data.frame(x = 1:5, y = 1:5, z = 1:5)
|
|
44 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
|
|
45
|
|
46 cleanup_columns = c("FR1.IMGT.c.a",
|
|
47 "FR2.IMGT.g.t",
|
|
48 "CDR1.IMGT.Nb.of.nucleotides",
|
|
49 "CDR2.IMGT.t.a",
|
|
50 "FR1.IMGT.c.g",
|
|
51 "CDR1.IMGT.c.t",
|
|
52 "FR2.IMGT.a.c",
|
|
53 "FR2.IMGT.Nb.of.mutations",
|
|
54 "FR2.IMGT.g.c",
|
|
55 "FR2.IMGT.a.g",
|
|
56 "FR3.IMGT.t.a",
|
|
57 "FR3.IMGT.t.c",
|
|
58 "FR2.IMGT.g.a",
|
|
59 "FR3.IMGT.c.g",
|
|
60 "FR1.IMGT.Nb.of.mutations",
|
|
61 "CDR1.IMGT.g.a",
|
|
62 "CDR1.IMGT.t.g",
|
|
63 "CDR1.IMGT.g.c",
|
|
64 "CDR2.IMGT.Nb.of.nucleotides",
|
|
65 "FR2.IMGT.a.t",
|
|
66 "CDR1.IMGT.Nb.of.mutations",
|
|
67 "CDR1.IMGT.a.g",
|
|
68 "FR3.IMGT.a.c",
|
|
69 "FR1.IMGT.g.a",
|
|
70 "FR3.IMGT.a.g",
|
|
71 "FR1.IMGT.a.t",
|
|
72 "CDR2.IMGT.a.g",
|
|
73 "CDR2.IMGT.Nb.of.mutations",
|
|
74 "CDR2.IMGT.g.t",
|
|
75 "CDR2.IMGT.a.c",
|
|
76 "CDR1.IMGT.t.c",
|
|
77 "FR3.IMGT.g.c",
|
|
78 "FR1.IMGT.g.t",
|
|
79 "FR3.IMGT.g.t",
|
|
80 "CDR1.IMGT.a.t",
|
|
81 "FR1.IMGT.a.g",
|
|
82 "FR3.IMGT.a.t",
|
|
83 "FR3.IMGT.Nb.of.nucleotides",
|
|
84 "FR2.IMGT.t.c",
|
|
85 "CDR2.IMGT.g.a",
|
|
86 "FR2.IMGT.t.a",
|
|
87 "CDR1.IMGT.t.a",
|
|
88 "FR2.IMGT.t.g",
|
|
89 "FR3.IMGT.t.g",
|
|
90 "FR2.IMGT.Nb.of.nucleotides",
|
|
91 "FR1.IMGT.t.a",
|
|
92 "FR1.IMGT.t.g",
|
|
93 "FR3.IMGT.c.t",
|
|
94 "FR1.IMGT.t.c",
|
|
95 "CDR2.IMGT.a.t",
|
|
96 "FR2.IMGT.c.t",
|
|
97 "CDR1.IMGT.g.t",
|
|
98 "CDR2.IMGT.t.g",
|
|
99 "FR1.IMGT.Nb.of.nucleotides",
|
|
100 "CDR1.IMGT.c.g",
|
|
101 "CDR2.IMGT.t.c",
|
|
102 "FR3.IMGT.g.a",
|
|
103 "CDR1.IMGT.a.c",
|
|
104 "FR2.IMGT.c.a",
|
|
105 "FR3.IMGT.Nb.of.mutations",
|
|
106 "FR2.IMGT.c.g",
|
|
107 "CDR2.IMGT.g.c",
|
|
108 "FR1.IMGT.g.c",
|
|
109 "CDR2.IMGT.c.t",
|
|
110 "FR3.IMGT.c.a",
|
|
111 "CDR1.IMGT.c.a",
|
|
112 "CDR2.IMGT.c.g",
|
|
113 "CDR2.IMGT.c.a",
|
|
114 "FR1.IMGT.c.t")
|
|
115
|
|
116 for(col in cleanup_columns){
|
|
117 dat[,col] = gsub("\\(.*\\)", "", dat[,col])
|
|
118 #dat[dat[,col] == "",] = "0"
|
|
119 dat[,col] = as.numeric(dat[,col])
|
|
120 dat[is.na(dat[,col]),] = 0
|
|
121 }
|
|
122
|
|
123 VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations +
|
|
124 dat$CDR1.IMGT.Nb.of.mutations +
|
|
125 dat$FR2.IMGT.Nb.of.mutations +
|
|
126 dat$CDR2.IMGT.Nb.of.mutations +
|
|
127 dat$FR3.IMGT.Nb.of.mutations)
|
|
128
|
|
129 VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides +
|
|
130 dat$CDR1.IMGT.Nb.of.nucleotides +
|
|
131 dat$FR2.IMGT.Nb.of.nucleotides +
|
|
132 dat$CDR2.IMGT.Nb.of.nucleotides +
|
|
133 dat$FR3.IMGT.Nb.of.nucleotides)
|
|
134
|
|
135 result[1,"x"] = VRegionMutations
|
|
136 result[1,"y"] = VRegionNucleotides
|
|
137 result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1)
|
|
138
|
|
139 transitionMutations = sum(dat$FR1.IMGT.a.g +
|
|
140 dat$FR1.IMGT.g.a +
|
|
141 dat$FR1.IMGT.c.t +
|
|
142 dat$FR1.IMGT.t.c +
|
|
143 dat$CDR1.IMGT.a.g +
|
|
144 dat$CDR1.IMGT.g.a +
|
|
145 dat$CDR1.IMGT.c.t +
|
|
146 dat$CDR1.IMGT.t.c +
|
|
147 dat$FR2.IMGT.a.g +
|
|
148 dat$FR2.IMGT.g.a +
|
|
149 dat$FR2.IMGT.c.t +
|
|
150 dat$FR2.IMGT.t.c +
|
|
151 dat$CDR2.IMGT.a.g +
|
|
152 dat$CDR2.IMGT.g.a +
|
|
153 dat$CDR2.IMGT.c.t +
|
|
154 dat$CDR2.IMGT.t.c +
|
|
155 dat$FR3.IMGT.a.g +
|
|
156 dat$FR3.IMGT.g.a +
|
|
157 dat$FR3.IMGT.c.t +
|
|
158 dat$FR3.IMGT.t.c)
|
|
159
|
|
160 result[2,"x"] = transitionMutations
|
|
161 result[2,"y"] = VRegionMutations
|
|
162 result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1)
|
|
163
|
|
164 transversionMutations = sum( dat$FR1.IMGT.a.c +
|
|
165 dat$FR1.IMGT.c.a +
|
|
166 dat$FR1.IMGT.a.t +
|
|
167 dat$FR1.IMGT.t.a +
|
|
168 dat$FR1.IMGT.g.c +
|
|
169 dat$FR1.IMGT.c.g +
|
|
170 dat$FR1.IMGT.g.t +
|
|
171 dat$FR1.IMGT.t.g +
|
|
172 dat$CDR1.IMGT.a.c +
|
|
173 dat$CDR1.IMGT.c.a +
|
|
174 dat$CDR1.IMGT.a.t +
|
|
175 dat$CDR1.IMGT.t.a +
|
|
176 dat$CDR1.IMGT.g.c +
|
|
177 dat$CDR1.IMGT.c.g +
|
|
178 dat$CDR1.IMGT.g.t +
|
|
179 dat$CDR1.IMGT.t.g +
|
|
180 dat$FR2.IMGT.a.c +
|
|
181 dat$FR2.IMGT.c.a +
|
|
182 dat$FR2.IMGT.a.t +
|
|
183 dat$FR2.IMGT.t.a +
|
|
184 dat$FR2.IMGT.g.c +
|
|
185 dat$FR2.IMGT.c.g +
|
|
186 dat$FR2.IMGT.g.t +
|
|
187 dat$FR2.IMGT.t.g +
|
|
188 dat$CDR2.IMGT.a.c +
|
|
189 dat$CDR2.IMGT.c.a +
|
|
190 dat$CDR2.IMGT.a.t +
|
|
191 dat$CDR2.IMGT.t.a +
|
|
192 dat$CDR2.IMGT.g.c +
|
|
193 dat$CDR2.IMGT.c.g +
|
|
194 dat$CDR2.IMGT.g.t +
|
|
195 dat$CDR2.IMGT.t.g +
|
|
196 dat$FR3.IMGT.a.c +
|
|
197 dat$FR3.IMGT.c.a +
|
|
198 dat$FR3.IMGT.a.t +
|
|
199 dat$FR3.IMGT.t.a +
|
|
200 dat$FR3.IMGT.g.c +
|
|
201 dat$FR3.IMGT.c.g +
|
|
202 dat$FR3.IMGT.g.t +
|
|
203 dat$FR3.IMGT.t.g)
|
|
204
|
|
205 result[3,"x"] = transversionMutations
|
|
206 result[3,"y"] = VRegionMutations
|
|
207 result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1)
|
|
208
|
|
209 transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a +
|
|
210 dat$FR1.IMGT.c.t +
|
|
211 dat$CDR1.IMGT.g.a +
|
|
212 dat$CDR1.IMGT.c.t +
|
|
213 dat$FR2.IMGT.g.a +
|
|
214 dat$FR2.IMGT.c.t +
|
|
215 dat$CDR2.IMGT.g.a +
|
|
216 dat$CDR2.IMGT.c.t +
|
|
217 dat$FR3.IMGT.g.a +
|
|
218 dat$FR3.IMGT.c.t)
|
|
219
|
|
220 totalMutationsAtGC = sum(dat$FR1.IMGT.g.a +
|
|
221 dat$FR1.IMGT.c.t +
|
|
222 dat$FR1.IMGT.c.a +
|
|
223 dat$FR1.IMGT.g.c +
|
|
224 dat$FR1.IMGT.c.g +
|
|
225 dat$FR1.IMGT.g.t +
|
|
226 dat$CDR1.IMGT.g.a +
|
|
227 dat$CDR1.IMGT.c.t +
|
|
228 dat$CDR1.IMGT.c.a +
|
|
229 dat$CDR1.IMGT.g.c +
|
|
230 dat$CDR1.IMGT.c.g +
|
|
231 dat$CDR1.IMGT.g.t +
|
|
232 dat$FR2.IMGT.g.a +
|
|
233 dat$FR2.IMGT.c.t +
|
|
234 dat$FR2.IMGT.c.a +
|
|
235 dat$FR2.IMGT.g.c +
|
|
236 dat$FR2.IMGT.c.g +
|
|
237 dat$FR2.IMGT.g.t +
|
|
238 dat$CDR2.IMGT.g.a +
|
|
239 dat$CDR2.IMGT.c.t +
|
|
240 dat$CDR2.IMGT.c.a +
|
|
241 dat$CDR2.IMGT.g.c +
|
|
242 dat$CDR2.IMGT.c.g +
|
|
243 dat$CDR2.IMGT.g.t +
|
|
244 dat$FR3.IMGT.g.a +
|
|
245 dat$FR3.IMGT.c.t +
|
|
246 dat$FR3.IMGT.c.a +
|
|
247 dat$FR3.IMGT.g.c +
|
|
248 dat$FR3.IMGT.c.g +
|
|
249 dat$FR3.IMGT.g.t)
|
|
250
|
|
251 result[4,"x"] = transitionMutationsAtGC
|
|
252 result[4,"y"] = totalMutationsAtGC
|
|
253 result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1)
|
|
254
|
|
255 result[5,"x"] = totalMutationsAtGC
|
|
256 result[5,"y"] = VRegionMutations
|
|
257 result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1)
|
|
258
|
|
259
|
|
260 #transitiontable
|
|
261
|
|
262 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
|
|
263 row.names(transitionTable) = c("A", "C", "G", "T")
|
|
264 transitionTable["A","A"] = NA
|
|
265 transitionTable["C","C"] = NA
|
|
266 transitionTable["G","G"] = NA
|
|
267 transitionTable["T","T"] = NA
|
|
268 nts = c("a", "c", "g", "t")
|
|
269
|
|
270
|
|
271 for(nt1 in nts){
|
|
272 for(nt2 in nts){
|
|
273 if(nt1 == nt2){
|
|
274 next
|
|
275 }
|
|
276 NT1 = LETTERS[letters == nt1]
|
|
277 NT2 = LETTERS[letters == nt2]
|
|
278 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
|
|
279 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
|
|
280 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
|
|
281 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
|
|
282 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
|
|
283 transitionTable[NT1,NT2] = sum( dat[,FR1] +
|
|
284 dat[,CDR1] +
|
|
285 dat[,FR2] +
|
|
286 dat[,CDR2] +
|
|
287 dat[,FR3])
|
|
288 }
|
|
289 }
|
|
290
|
|
291 setwd(outputdir)
|
|
292 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
|
|
293 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
|
|
294 cat(length(dat$Sequence.ID), file="n.txt")
|
|
295
|
|
296
|
|
297
|
|
298
|
|
299
|
|
300
|
|
301
|
|
302
|
|
303
|
|
304
|
|
305
|
|
306
|
|
307
|
|
308
|
|
309
|
|
310
|
|
311
|
|
312
|
|
313
|
|
314
|
|
315
|
|
316
|
|
317
|
|
318
|
|
319
|
|
320
|
|
321
|
|
322
|
|
323
|
|
324
|
|
325
|
|
326
|
|
327
|
|
328
|