Mercurial > repos > davidvanzessen > mutation_analysis
comparison mutation_analysis.r @ 0:74d2bc479bee draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 18 Aug 2014 04:04:37 -0400 |
| parents | |
| children | 2f4298673519 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:74d2bc479bee |
|---|---|
| 1 args <- commandArgs(trailingOnly = TRUE) | |
| 2 | |
| 3 input = args[1] | |
| 4 summaryinput = args[2] | |
| 5 outputdir = args[3] | |
| 6 setwd(outputdir) | |
| 7 | |
| 8 #dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F) | |
| 9 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) | |
| 10 | |
| 11 datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F) | |
| 12 datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")] | |
| 13 | |
| 14 dat = merge(dat, datSum, by="Sequence.ID", all.x=T) | |
| 15 | |
| 16 #dat = dat[dat$Functionality == "productive",] | |
| 17 | |
| 18 dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele) | |
| 19 dat$VGene = gsub("[*].*", "", dat$VGene) | |
| 20 | |
| 21 dat$past = paste(dat$AA.JUNCTION, dat$VGene) | |
| 22 | |
| 23 #dat = dat[!duplicated(dat$past), ] | |
| 24 | |
| 25 if(length(dat$Sequence.ID) == 0){ | |
| 26 setwd(outputdir) | |
| 27 result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) | |
| 28 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") | |
| 29 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) | |
| 30 transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) | |
| 31 row.names(transitionTable) = c("A", "C", "G", "T") | |
| 32 transitionTable["A","A"] = NA | |
| 33 transitionTable["C","C"] = NA | |
| 34 transitionTable["G","G"] = NA | |
| 35 transitionTable["T","T"] = NA | |
| 36 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) | |
| 37 cat("0", file="n.txt") | |
| 38 stop("No data") | |
| 39 } | |
| 40 | |
| 41 #print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left")) | |
| 42 | |
| 43 result = data.frame(x = 1:5, y = 1:5, z = 1:5) | |
| 44 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") | |
| 45 | |
| 46 cleanup_columns = c("FR1.IMGT.c.a", | |
| 47 "FR2.IMGT.g.t", | |
| 48 "CDR1.IMGT.Nb.of.nucleotides", | |
| 49 "CDR2.IMGT.t.a", | |
| 50 "FR1.IMGT.c.g", | |
| 51 "CDR1.IMGT.c.t", | |
| 52 "FR2.IMGT.a.c", | |
| 53 "FR2.IMGT.Nb.of.mutations", | |
| 54 "FR2.IMGT.g.c", | |
| 55 "FR2.IMGT.a.g", | |
| 56 "FR3.IMGT.t.a", | |
| 57 "FR3.IMGT.t.c", | |
| 58 "FR2.IMGT.g.a", | |
| 59 "FR3.IMGT.c.g", | |
| 60 "FR1.IMGT.Nb.of.mutations", | |
| 61 "CDR1.IMGT.g.a", | |
| 62 "CDR1.IMGT.t.g", | |
| 63 "CDR1.IMGT.g.c", | |
| 64 "CDR2.IMGT.Nb.of.nucleotides", | |
| 65 "FR2.IMGT.a.t", | |
| 66 "CDR1.IMGT.Nb.of.mutations", | |
| 67 "CDR1.IMGT.a.g", | |
| 68 "FR3.IMGT.a.c", | |
| 69 "FR1.IMGT.g.a", | |
| 70 "FR3.IMGT.a.g", | |
| 71 "FR1.IMGT.a.t", | |
| 72 "CDR2.IMGT.a.g", | |
| 73 "CDR2.IMGT.Nb.of.mutations", | |
| 74 "CDR2.IMGT.g.t", | |
| 75 "CDR2.IMGT.a.c", | |
| 76 "CDR1.IMGT.t.c", | |
| 77 "FR3.IMGT.g.c", | |
| 78 "FR1.IMGT.g.t", | |
| 79 "FR3.IMGT.g.t", | |
| 80 "CDR1.IMGT.a.t", | |
| 81 "FR1.IMGT.a.g", | |
| 82 "FR3.IMGT.a.t", | |
| 83 "FR3.IMGT.Nb.of.nucleotides", | |
| 84 "FR2.IMGT.t.c", | |
| 85 "CDR2.IMGT.g.a", | |
| 86 "FR2.IMGT.t.a", | |
| 87 "CDR1.IMGT.t.a", | |
| 88 "FR2.IMGT.t.g", | |
| 89 "FR3.IMGT.t.g", | |
| 90 "FR2.IMGT.Nb.of.nucleotides", | |
| 91 "FR1.IMGT.t.a", | |
| 92 "FR1.IMGT.t.g", | |
| 93 "FR3.IMGT.c.t", | |
| 94 "FR1.IMGT.t.c", | |
| 95 "CDR2.IMGT.a.t", | |
| 96 "FR2.IMGT.c.t", | |
| 97 "CDR1.IMGT.g.t", | |
| 98 "CDR2.IMGT.t.g", | |
| 99 "FR1.IMGT.Nb.of.nucleotides", | |
| 100 "CDR1.IMGT.c.g", | |
| 101 "CDR2.IMGT.t.c", | |
| 102 "FR3.IMGT.g.a", | |
| 103 "CDR1.IMGT.a.c", | |
| 104 "FR2.IMGT.c.a", | |
| 105 "FR3.IMGT.Nb.of.mutations", | |
| 106 "FR2.IMGT.c.g", | |
| 107 "CDR2.IMGT.g.c", | |
| 108 "FR1.IMGT.g.c", | |
| 109 "CDR2.IMGT.c.t", | |
| 110 "FR3.IMGT.c.a", | |
| 111 "CDR1.IMGT.c.a", | |
| 112 "CDR2.IMGT.c.g", | |
| 113 "CDR2.IMGT.c.a", | |
| 114 "FR1.IMGT.c.t") | |
| 115 | |
| 116 for(col in cleanup_columns){ | |
| 117 dat[,col] = gsub("\\(.*\\)", "", dat[,col]) | |
| 118 #dat[dat[,col] == "",] = "0" | |
| 119 dat[,col] = as.numeric(dat[,col]) | |
| 120 dat[is.na(dat[,col]),] = 0 | |
| 121 } | |
| 122 | |
| 123 VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations + | |
| 124 dat$CDR1.IMGT.Nb.of.mutations + | |
| 125 dat$FR2.IMGT.Nb.of.mutations + | |
| 126 dat$CDR2.IMGT.Nb.of.mutations + | |
| 127 dat$FR3.IMGT.Nb.of.mutations) | |
| 128 | |
| 129 VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides + | |
| 130 dat$CDR1.IMGT.Nb.of.nucleotides + | |
| 131 dat$FR2.IMGT.Nb.of.nucleotides + | |
| 132 dat$CDR2.IMGT.Nb.of.nucleotides + | |
| 133 dat$FR3.IMGT.Nb.of.nucleotides) | |
| 134 | |
| 135 result[1,"x"] = VRegionMutations | |
| 136 result[1,"y"] = VRegionNucleotides | |
| 137 result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1) | |
| 138 | |
| 139 transitionMutations = sum(dat$FR1.IMGT.a.g + | |
| 140 dat$FR1.IMGT.g.a + | |
| 141 dat$FR1.IMGT.c.t + | |
| 142 dat$FR1.IMGT.t.c + | |
| 143 dat$CDR1.IMGT.a.g + | |
| 144 dat$CDR1.IMGT.g.a + | |
| 145 dat$CDR1.IMGT.c.t + | |
| 146 dat$CDR1.IMGT.t.c + | |
| 147 dat$FR2.IMGT.a.g + | |
| 148 dat$FR2.IMGT.g.a + | |
| 149 dat$FR2.IMGT.c.t + | |
| 150 dat$FR2.IMGT.t.c + | |
| 151 dat$CDR2.IMGT.a.g + | |
| 152 dat$CDR2.IMGT.g.a + | |
| 153 dat$CDR2.IMGT.c.t + | |
| 154 dat$CDR2.IMGT.t.c + | |
| 155 dat$FR3.IMGT.a.g + | |
| 156 dat$FR3.IMGT.g.a + | |
| 157 dat$FR3.IMGT.c.t + | |
| 158 dat$FR3.IMGT.t.c) | |
| 159 | |
| 160 result[2,"x"] = transitionMutations | |
| 161 result[2,"y"] = VRegionMutations | |
| 162 result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1) | |
| 163 | |
| 164 transversionMutations = sum( dat$FR1.IMGT.a.c + | |
| 165 dat$FR1.IMGT.c.a + | |
| 166 dat$FR1.IMGT.a.t + | |
| 167 dat$FR1.IMGT.t.a + | |
| 168 dat$FR1.IMGT.g.c + | |
| 169 dat$FR1.IMGT.c.g + | |
| 170 dat$FR1.IMGT.g.t + | |
| 171 dat$FR1.IMGT.t.g + | |
| 172 dat$CDR1.IMGT.a.c + | |
| 173 dat$CDR1.IMGT.c.a + | |
| 174 dat$CDR1.IMGT.a.t + | |
| 175 dat$CDR1.IMGT.t.a + | |
| 176 dat$CDR1.IMGT.g.c + | |
| 177 dat$CDR1.IMGT.c.g + | |
| 178 dat$CDR1.IMGT.g.t + | |
| 179 dat$CDR1.IMGT.t.g + | |
| 180 dat$FR2.IMGT.a.c + | |
| 181 dat$FR2.IMGT.c.a + | |
| 182 dat$FR2.IMGT.a.t + | |
| 183 dat$FR2.IMGT.t.a + | |
| 184 dat$FR2.IMGT.g.c + | |
| 185 dat$FR2.IMGT.c.g + | |
| 186 dat$FR2.IMGT.g.t + | |
| 187 dat$FR2.IMGT.t.g + | |
| 188 dat$CDR2.IMGT.a.c + | |
| 189 dat$CDR2.IMGT.c.a + | |
| 190 dat$CDR2.IMGT.a.t + | |
| 191 dat$CDR2.IMGT.t.a + | |
| 192 dat$CDR2.IMGT.g.c + | |
| 193 dat$CDR2.IMGT.c.g + | |
| 194 dat$CDR2.IMGT.g.t + | |
| 195 dat$CDR2.IMGT.t.g + | |
| 196 dat$FR3.IMGT.a.c + | |
| 197 dat$FR3.IMGT.c.a + | |
| 198 dat$FR3.IMGT.a.t + | |
| 199 dat$FR3.IMGT.t.a + | |
| 200 dat$FR3.IMGT.g.c + | |
| 201 dat$FR3.IMGT.c.g + | |
| 202 dat$FR3.IMGT.g.t + | |
| 203 dat$FR3.IMGT.t.g) | |
| 204 | |
| 205 result[3,"x"] = transversionMutations | |
| 206 result[3,"y"] = VRegionMutations | |
| 207 result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1) | |
| 208 | |
| 209 transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a + | |
| 210 dat$FR1.IMGT.c.t + | |
| 211 dat$CDR1.IMGT.g.a + | |
| 212 dat$CDR1.IMGT.c.t + | |
| 213 dat$FR2.IMGT.g.a + | |
| 214 dat$FR2.IMGT.c.t + | |
| 215 dat$CDR2.IMGT.g.a + | |
| 216 dat$CDR2.IMGT.c.t + | |
| 217 dat$FR3.IMGT.g.a + | |
| 218 dat$FR3.IMGT.c.t) | |
| 219 | |
| 220 totalMutationsAtGC = sum(dat$FR1.IMGT.g.a + | |
| 221 dat$FR1.IMGT.c.t + | |
| 222 dat$FR1.IMGT.c.a + | |
| 223 dat$FR1.IMGT.g.c + | |
| 224 dat$FR1.IMGT.c.g + | |
| 225 dat$FR1.IMGT.g.t + | |
| 226 dat$CDR1.IMGT.g.a + | |
| 227 dat$CDR1.IMGT.c.t + | |
| 228 dat$CDR1.IMGT.c.a + | |
| 229 dat$CDR1.IMGT.g.c + | |
| 230 dat$CDR1.IMGT.c.g + | |
| 231 dat$CDR1.IMGT.g.t + | |
| 232 dat$FR2.IMGT.g.a + | |
| 233 dat$FR2.IMGT.c.t + | |
| 234 dat$FR2.IMGT.c.a + | |
| 235 dat$FR2.IMGT.g.c + | |
| 236 dat$FR2.IMGT.c.g + | |
| 237 dat$FR2.IMGT.g.t + | |
| 238 dat$CDR2.IMGT.g.a + | |
| 239 dat$CDR2.IMGT.c.t + | |
| 240 dat$CDR2.IMGT.c.a + | |
| 241 dat$CDR2.IMGT.g.c + | |
| 242 dat$CDR2.IMGT.c.g + | |
| 243 dat$CDR2.IMGT.g.t + | |
| 244 dat$FR3.IMGT.g.a + | |
| 245 dat$FR3.IMGT.c.t + | |
| 246 dat$FR3.IMGT.c.a + | |
| 247 dat$FR3.IMGT.g.c + | |
| 248 dat$FR3.IMGT.c.g + | |
| 249 dat$FR3.IMGT.g.t) | |
| 250 | |
| 251 result[4,"x"] = transitionMutationsAtGC | |
| 252 result[4,"y"] = totalMutationsAtGC | |
| 253 result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1) | |
| 254 | |
| 255 result[5,"x"] = totalMutationsAtGC | |
| 256 result[5,"y"] = VRegionMutations | |
| 257 result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1) | |
| 258 | |
| 259 | |
| 260 #transitiontable | |
| 261 | |
| 262 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) | |
| 263 row.names(transitionTable) = c("A", "C", "G", "T") | |
| 264 transitionTable["A","A"] = NA | |
| 265 transitionTable["C","C"] = NA | |
| 266 transitionTable["G","G"] = NA | |
| 267 transitionTable["T","T"] = NA | |
| 268 nts = c("a", "c", "g", "t") | |
| 269 | |
| 270 | |
| 271 for(nt1 in nts){ | |
| 272 for(nt2 in nts){ | |
| 273 if(nt1 == nt2){ | |
| 274 next | |
| 275 } | |
| 276 NT1 = LETTERS[letters == nt1] | |
| 277 NT2 = LETTERS[letters == nt2] | |
| 278 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") | |
| 279 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") | |
| 280 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") | |
| 281 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") | |
| 282 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") | |
| 283 transitionTable[NT1,NT2] = sum( dat[,FR1] + | |
| 284 dat[,CDR1] + | |
| 285 dat[,FR2] + | |
| 286 dat[,CDR2] + | |
| 287 dat[,FR3]) | |
| 288 } | |
| 289 } | |
| 290 | |
| 291 setwd(outputdir) | |
| 292 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) | |
| 293 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) | |
| 294 cat(length(dat$Sequence.ID), file="n.txt") | |
| 295 | |
| 296 | |
| 297 | |
| 298 | |
| 299 | |
| 300 | |
| 301 | |
| 302 | |
| 303 | |
| 304 | |
| 305 | |
| 306 | |
| 307 | |
| 308 | |
| 309 | |
| 310 | |
| 311 | |
| 312 | |
| 313 | |
| 314 | |
| 315 | |
| 316 | |
| 317 | |
| 318 | |
| 319 | |
| 320 | |
| 321 | |
| 322 | |
| 323 | |
| 324 | |
| 325 | |
| 326 | |
| 327 | |
| 328 |
