Mercurial > repos > davidvanzessen > mutation_analysis
comparison mutation_analysis.r @ 0:74d2bc479bee draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 18 Aug 2014 04:04:37 -0400 |
parents | |
children | 2f4298673519 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:74d2bc479bee |
---|---|
1 args <- commandArgs(trailingOnly = TRUE) | |
2 | |
3 input = args[1] | |
4 summaryinput = args[2] | |
5 outputdir = args[3] | |
6 setwd(outputdir) | |
7 | |
8 #dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F) | |
9 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) | |
10 | |
11 datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F) | |
12 datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")] | |
13 | |
14 dat = merge(dat, datSum, by="Sequence.ID", all.x=T) | |
15 | |
16 #dat = dat[dat$Functionality == "productive",] | |
17 | |
18 dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele) | |
19 dat$VGene = gsub("[*].*", "", dat$VGene) | |
20 | |
21 dat$past = paste(dat$AA.JUNCTION, dat$VGene) | |
22 | |
23 #dat = dat[!duplicated(dat$past), ] | |
24 | |
25 if(length(dat$Sequence.ID) == 0){ | |
26 setwd(outputdir) | |
27 result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) | |
28 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") | |
29 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) | |
30 transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) | |
31 row.names(transitionTable) = c("A", "C", "G", "T") | |
32 transitionTable["A","A"] = NA | |
33 transitionTable["C","C"] = NA | |
34 transitionTable["G","G"] = NA | |
35 transitionTable["T","T"] = NA | |
36 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) | |
37 cat("0", file="n.txt") | |
38 stop("No data") | |
39 } | |
40 | |
41 #print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left")) | |
42 | |
43 result = data.frame(x = 1:5, y = 1:5, z = 1:5) | |
44 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") | |
45 | |
46 cleanup_columns = c("FR1.IMGT.c.a", | |
47 "FR2.IMGT.g.t", | |
48 "CDR1.IMGT.Nb.of.nucleotides", | |
49 "CDR2.IMGT.t.a", | |
50 "FR1.IMGT.c.g", | |
51 "CDR1.IMGT.c.t", | |
52 "FR2.IMGT.a.c", | |
53 "FR2.IMGT.Nb.of.mutations", | |
54 "FR2.IMGT.g.c", | |
55 "FR2.IMGT.a.g", | |
56 "FR3.IMGT.t.a", | |
57 "FR3.IMGT.t.c", | |
58 "FR2.IMGT.g.a", | |
59 "FR3.IMGT.c.g", | |
60 "FR1.IMGT.Nb.of.mutations", | |
61 "CDR1.IMGT.g.a", | |
62 "CDR1.IMGT.t.g", | |
63 "CDR1.IMGT.g.c", | |
64 "CDR2.IMGT.Nb.of.nucleotides", | |
65 "FR2.IMGT.a.t", | |
66 "CDR1.IMGT.Nb.of.mutations", | |
67 "CDR1.IMGT.a.g", | |
68 "FR3.IMGT.a.c", | |
69 "FR1.IMGT.g.a", | |
70 "FR3.IMGT.a.g", | |
71 "FR1.IMGT.a.t", | |
72 "CDR2.IMGT.a.g", | |
73 "CDR2.IMGT.Nb.of.mutations", | |
74 "CDR2.IMGT.g.t", | |
75 "CDR2.IMGT.a.c", | |
76 "CDR1.IMGT.t.c", | |
77 "FR3.IMGT.g.c", | |
78 "FR1.IMGT.g.t", | |
79 "FR3.IMGT.g.t", | |
80 "CDR1.IMGT.a.t", | |
81 "FR1.IMGT.a.g", | |
82 "FR3.IMGT.a.t", | |
83 "FR3.IMGT.Nb.of.nucleotides", | |
84 "FR2.IMGT.t.c", | |
85 "CDR2.IMGT.g.a", | |
86 "FR2.IMGT.t.a", | |
87 "CDR1.IMGT.t.a", | |
88 "FR2.IMGT.t.g", | |
89 "FR3.IMGT.t.g", | |
90 "FR2.IMGT.Nb.of.nucleotides", | |
91 "FR1.IMGT.t.a", | |
92 "FR1.IMGT.t.g", | |
93 "FR3.IMGT.c.t", | |
94 "FR1.IMGT.t.c", | |
95 "CDR2.IMGT.a.t", | |
96 "FR2.IMGT.c.t", | |
97 "CDR1.IMGT.g.t", | |
98 "CDR2.IMGT.t.g", | |
99 "FR1.IMGT.Nb.of.nucleotides", | |
100 "CDR1.IMGT.c.g", | |
101 "CDR2.IMGT.t.c", | |
102 "FR3.IMGT.g.a", | |
103 "CDR1.IMGT.a.c", | |
104 "FR2.IMGT.c.a", | |
105 "FR3.IMGT.Nb.of.mutations", | |
106 "FR2.IMGT.c.g", | |
107 "CDR2.IMGT.g.c", | |
108 "FR1.IMGT.g.c", | |
109 "CDR2.IMGT.c.t", | |
110 "FR3.IMGT.c.a", | |
111 "CDR1.IMGT.c.a", | |
112 "CDR2.IMGT.c.g", | |
113 "CDR2.IMGT.c.a", | |
114 "FR1.IMGT.c.t") | |
115 | |
116 for(col in cleanup_columns){ | |
117 dat[,col] = gsub("\\(.*\\)", "", dat[,col]) | |
118 #dat[dat[,col] == "",] = "0" | |
119 dat[,col] = as.numeric(dat[,col]) | |
120 dat[is.na(dat[,col]),] = 0 | |
121 } | |
122 | |
123 VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations + | |
124 dat$CDR1.IMGT.Nb.of.mutations + | |
125 dat$FR2.IMGT.Nb.of.mutations + | |
126 dat$CDR2.IMGT.Nb.of.mutations + | |
127 dat$FR3.IMGT.Nb.of.mutations) | |
128 | |
129 VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides + | |
130 dat$CDR1.IMGT.Nb.of.nucleotides + | |
131 dat$FR2.IMGT.Nb.of.nucleotides + | |
132 dat$CDR2.IMGT.Nb.of.nucleotides + | |
133 dat$FR3.IMGT.Nb.of.nucleotides) | |
134 | |
135 result[1,"x"] = VRegionMutations | |
136 result[1,"y"] = VRegionNucleotides | |
137 result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1) | |
138 | |
139 transitionMutations = sum(dat$FR1.IMGT.a.g + | |
140 dat$FR1.IMGT.g.a + | |
141 dat$FR1.IMGT.c.t + | |
142 dat$FR1.IMGT.t.c + | |
143 dat$CDR1.IMGT.a.g + | |
144 dat$CDR1.IMGT.g.a + | |
145 dat$CDR1.IMGT.c.t + | |
146 dat$CDR1.IMGT.t.c + | |
147 dat$FR2.IMGT.a.g + | |
148 dat$FR2.IMGT.g.a + | |
149 dat$FR2.IMGT.c.t + | |
150 dat$FR2.IMGT.t.c + | |
151 dat$CDR2.IMGT.a.g + | |
152 dat$CDR2.IMGT.g.a + | |
153 dat$CDR2.IMGT.c.t + | |
154 dat$CDR2.IMGT.t.c + | |
155 dat$FR3.IMGT.a.g + | |
156 dat$FR3.IMGT.g.a + | |
157 dat$FR3.IMGT.c.t + | |
158 dat$FR3.IMGT.t.c) | |
159 | |
160 result[2,"x"] = transitionMutations | |
161 result[2,"y"] = VRegionMutations | |
162 result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1) | |
163 | |
164 transversionMutations = sum( dat$FR1.IMGT.a.c + | |
165 dat$FR1.IMGT.c.a + | |
166 dat$FR1.IMGT.a.t + | |
167 dat$FR1.IMGT.t.a + | |
168 dat$FR1.IMGT.g.c + | |
169 dat$FR1.IMGT.c.g + | |
170 dat$FR1.IMGT.g.t + | |
171 dat$FR1.IMGT.t.g + | |
172 dat$CDR1.IMGT.a.c + | |
173 dat$CDR1.IMGT.c.a + | |
174 dat$CDR1.IMGT.a.t + | |
175 dat$CDR1.IMGT.t.a + | |
176 dat$CDR1.IMGT.g.c + | |
177 dat$CDR1.IMGT.c.g + | |
178 dat$CDR1.IMGT.g.t + | |
179 dat$CDR1.IMGT.t.g + | |
180 dat$FR2.IMGT.a.c + | |
181 dat$FR2.IMGT.c.a + | |
182 dat$FR2.IMGT.a.t + | |
183 dat$FR2.IMGT.t.a + | |
184 dat$FR2.IMGT.g.c + | |
185 dat$FR2.IMGT.c.g + | |
186 dat$FR2.IMGT.g.t + | |
187 dat$FR2.IMGT.t.g + | |
188 dat$CDR2.IMGT.a.c + | |
189 dat$CDR2.IMGT.c.a + | |
190 dat$CDR2.IMGT.a.t + | |
191 dat$CDR2.IMGT.t.a + | |
192 dat$CDR2.IMGT.g.c + | |
193 dat$CDR2.IMGT.c.g + | |
194 dat$CDR2.IMGT.g.t + | |
195 dat$CDR2.IMGT.t.g + | |
196 dat$FR3.IMGT.a.c + | |
197 dat$FR3.IMGT.c.a + | |
198 dat$FR3.IMGT.a.t + | |
199 dat$FR3.IMGT.t.a + | |
200 dat$FR3.IMGT.g.c + | |
201 dat$FR3.IMGT.c.g + | |
202 dat$FR3.IMGT.g.t + | |
203 dat$FR3.IMGT.t.g) | |
204 | |
205 result[3,"x"] = transversionMutations | |
206 result[3,"y"] = VRegionMutations | |
207 result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1) | |
208 | |
209 transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a + | |
210 dat$FR1.IMGT.c.t + | |
211 dat$CDR1.IMGT.g.a + | |
212 dat$CDR1.IMGT.c.t + | |
213 dat$FR2.IMGT.g.a + | |
214 dat$FR2.IMGT.c.t + | |
215 dat$CDR2.IMGT.g.a + | |
216 dat$CDR2.IMGT.c.t + | |
217 dat$FR3.IMGT.g.a + | |
218 dat$FR3.IMGT.c.t) | |
219 | |
220 totalMutationsAtGC = sum(dat$FR1.IMGT.g.a + | |
221 dat$FR1.IMGT.c.t + | |
222 dat$FR1.IMGT.c.a + | |
223 dat$FR1.IMGT.g.c + | |
224 dat$FR1.IMGT.c.g + | |
225 dat$FR1.IMGT.g.t + | |
226 dat$CDR1.IMGT.g.a + | |
227 dat$CDR1.IMGT.c.t + | |
228 dat$CDR1.IMGT.c.a + | |
229 dat$CDR1.IMGT.g.c + | |
230 dat$CDR1.IMGT.c.g + | |
231 dat$CDR1.IMGT.g.t + | |
232 dat$FR2.IMGT.g.a + | |
233 dat$FR2.IMGT.c.t + | |
234 dat$FR2.IMGT.c.a + | |
235 dat$FR2.IMGT.g.c + | |
236 dat$FR2.IMGT.c.g + | |
237 dat$FR2.IMGT.g.t + | |
238 dat$CDR2.IMGT.g.a + | |
239 dat$CDR2.IMGT.c.t + | |
240 dat$CDR2.IMGT.c.a + | |
241 dat$CDR2.IMGT.g.c + | |
242 dat$CDR2.IMGT.c.g + | |
243 dat$CDR2.IMGT.g.t + | |
244 dat$FR3.IMGT.g.a + | |
245 dat$FR3.IMGT.c.t + | |
246 dat$FR3.IMGT.c.a + | |
247 dat$FR3.IMGT.g.c + | |
248 dat$FR3.IMGT.c.g + | |
249 dat$FR3.IMGT.g.t) | |
250 | |
251 result[4,"x"] = transitionMutationsAtGC | |
252 result[4,"y"] = totalMutationsAtGC | |
253 result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1) | |
254 | |
255 result[5,"x"] = totalMutationsAtGC | |
256 result[5,"y"] = VRegionMutations | |
257 result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1) | |
258 | |
259 | |
260 #transitiontable | |
261 | |
262 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) | |
263 row.names(transitionTable) = c("A", "C", "G", "T") | |
264 transitionTable["A","A"] = NA | |
265 transitionTable["C","C"] = NA | |
266 transitionTable["G","G"] = NA | |
267 transitionTable["T","T"] = NA | |
268 nts = c("a", "c", "g", "t") | |
269 | |
270 | |
271 for(nt1 in nts){ | |
272 for(nt2 in nts){ | |
273 if(nt1 == nt2){ | |
274 next | |
275 } | |
276 NT1 = LETTERS[letters == nt1] | |
277 NT2 = LETTERS[letters == nt2] | |
278 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") | |
279 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") | |
280 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") | |
281 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") | |
282 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") | |
283 transitionTable[NT1,NT2] = sum( dat[,FR1] + | |
284 dat[,CDR1] + | |
285 dat[,FR2] + | |
286 dat[,CDR2] + | |
287 dat[,FR3]) | |
288 } | |
289 } | |
290 | |
291 setwd(outputdir) | |
292 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) | |
293 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) | |
294 cat(length(dat$Sequence.ID), file="n.txt") | |
295 | |
296 | |
297 | |
298 | |
299 | |
300 | |
301 | |
302 | |
303 | |
304 | |
305 | |
306 | |
307 | |
308 | |
309 | |
310 | |
311 | |
312 | |
313 | |
314 | |
315 | |
316 | |
317 | |
318 | |
319 | |
320 | |
321 | |
322 | |
323 | |
324 | |
325 | |
326 | |
327 | |
328 |