diff mutation_analysis.r @ 0:74d2bc479bee draft

Uploaded
author davidvanzessen
date Mon, 18 Aug 2014 04:04:37 -0400
parents
children 2f4298673519
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.r	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,328 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input = args[1]
+summaryinput = args[2]
+outputdir = args[3]
+setwd(outputdir)
+
+#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F)
+dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
+
+datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F)
+datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")]
+
+dat = merge(dat, datSum, by="Sequence.ID", all.x=T)
+
+#dat = dat[dat$Functionality == "productive",]
+
+dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
+dat$VGene = gsub("[*].*", "", dat$VGene) 
+
+dat$past = paste(dat$AA.JUNCTION, dat$VGene)
+
+#dat = dat[!duplicated(dat$past), ]
+
+if(length(dat$Sequence.ID) == 0){
+	setwd(outputdir)
+	result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
+	row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
+	write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+	transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
+	row.names(transitionTable) = c("A", "C", "G", "T")
+	transitionTable["A","A"] = NA
+	transitionTable["C","C"] = NA
+	transitionTable["G","G"] = NA
+	transitionTable["T","T"] = NA
+	write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
+	cat("0", file="n.txt")
+	stop("No data")
+}
+
+#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left"))
+
+result = data.frame(x = 1:5, y = 1:5, z = 1:5)
+row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
+
+cleanup_columns = c("FR1.IMGT.c.a",
+                    "FR2.IMGT.g.t",
+                    "CDR1.IMGT.Nb.of.nucleotides",
+                    "CDR2.IMGT.t.a",
+                    "FR1.IMGT.c.g",
+                    "CDR1.IMGT.c.t",
+                    "FR2.IMGT.a.c",
+                    "FR2.IMGT.Nb.of.mutations",
+                    "FR2.IMGT.g.c",
+                    "FR2.IMGT.a.g",
+                    "FR3.IMGT.t.a",
+                    "FR3.IMGT.t.c",
+                    "FR2.IMGT.g.a",
+                    "FR3.IMGT.c.g",
+                    "FR1.IMGT.Nb.of.mutations",
+                    "CDR1.IMGT.g.a",
+                    "CDR1.IMGT.t.g",
+                    "CDR1.IMGT.g.c",
+                    "CDR2.IMGT.Nb.of.nucleotides",
+                    "FR2.IMGT.a.t",
+                    "CDR1.IMGT.Nb.of.mutations",
+                    "CDR1.IMGT.a.g",
+                    "FR3.IMGT.a.c",
+                    "FR1.IMGT.g.a",
+                    "FR3.IMGT.a.g",
+                    "FR1.IMGT.a.t",
+                    "CDR2.IMGT.a.g",
+                    "CDR2.IMGT.Nb.of.mutations",
+                    "CDR2.IMGT.g.t",
+                    "CDR2.IMGT.a.c",
+                    "CDR1.IMGT.t.c",
+                    "FR3.IMGT.g.c",
+                    "FR1.IMGT.g.t",
+                    "FR3.IMGT.g.t",
+                    "CDR1.IMGT.a.t",
+                    "FR1.IMGT.a.g",
+                    "FR3.IMGT.a.t",
+                    "FR3.IMGT.Nb.of.nucleotides",
+                    "FR2.IMGT.t.c",
+                    "CDR2.IMGT.g.a",
+                    "FR2.IMGT.t.a",
+                    "CDR1.IMGT.t.a",
+                    "FR2.IMGT.t.g",
+                    "FR3.IMGT.t.g",
+                    "FR2.IMGT.Nb.of.nucleotides",
+                    "FR1.IMGT.t.a",
+                    "FR1.IMGT.t.g",
+                    "FR3.IMGT.c.t",
+                    "FR1.IMGT.t.c",
+                    "CDR2.IMGT.a.t",
+                    "FR2.IMGT.c.t",
+                    "CDR1.IMGT.g.t",
+                    "CDR2.IMGT.t.g",
+                    "FR1.IMGT.Nb.of.nucleotides",
+                    "CDR1.IMGT.c.g",
+                    "CDR2.IMGT.t.c",
+                    "FR3.IMGT.g.a",
+                    "CDR1.IMGT.a.c",
+                    "FR2.IMGT.c.a",
+                    "FR3.IMGT.Nb.of.mutations",
+                    "FR2.IMGT.c.g",
+                    "CDR2.IMGT.g.c",
+                    "FR1.IMGT.g.c",
+                    "CDR2.IMGT.c.t",
+                    "FR3.IMGT.c.a",
+                    "CDR1.IMGT.c.a",
+                    "CDR2.IMGT.c.g",
+                    "CDR2.IMGT.c.a",
+                    "FR1.IMGT.c.t")
+
+for(col in cleanup_columns){
+  dat[,col] = gsub("\\(.*\\)", "", dat[,col])
+  #dat[dat[,col] == "",] = "0"
+  dat[,col] = as.numeric(dat[,col])
+  dat[is.na(dat[,col]),] = 0
+}
+
+VRegionMutations =  sum(dat$FR1.IMGT.Nb.of.mutations + 
+                        dat$CDR1.IMGT.Nb.of.mutations + 
+                        dat$FR2.IMGT.Nb.of.mutations + 
+                        dat$CDR2.IMGT.Nb.of.mutations +
+                        dat$FR3.IMGT.Nb.of.mutations)
+
+VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides +
+                          dat$CDR1.IMGT.Nb.of.nucleotides +
+                          dat$FR2.IMGT.Nb.of.nucleotides +
+                          dat$CDR2.IMGT.Nb.of.nucleotides +
+                          dat$FR3.IMGT.Nb.of.nucleotides)
+
+result[1,"x"] = VRegionMutations
+result[1,"y"] = VRegionNucleotides
+result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1)
+
+transitionMutations = sum(dat$FR1.IMGT.a.g +
+                          dat$FR1.IMGT.g.a +
+                          dat$FR1.IMGT.c.t +
+                          dat$FR1.IMGT.t.c +
+                          dat$CDR1.IMGT.a.g +
+                          dat$CDR1.IMGT.g.a +
+                          dat$CDR1.IMGT.c.t +
+                          dat$CDR1.IMGT.t.c +
+                          dat$FR2.IMGT.a.g +
+                          dat$FR2.IMGT.g.a +
+                          dat$FR2.IMGT.c.t +
+                          dat$FR2.IMGT.t.c +
+                          dat$CDR2.IMGT.a.g +
+                          dat$CDR2.IMGT.g.a +
+                          dat$CDR2.IMGT.c.t +
+                          dat$CDR2.IMGT.t.c +
+                          dat$FR3.IMGT.a.g +
+                          dat$FR3.IMGT.g.a +
+                          dat$FR3.IMGT.c.t +
+                          dat$FR3.IMGT.t.c)
+
+result[2,"x"] = transitionMutations
+result[2,"y"] = VRegionMutations
+result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1)
+
+transversionMutations = sum(  dat$FR1.IMGT.a.c +
+                              dat$FR1.IMGT.c.a +
+                              dat$FR1.IMGT.a.t +
+                              dat$FR1.IMGT.t.a +
+                              dat$FR1.IMGT.g.c +
+                              dat$FR1.IMGT.c.g +
+                              dat$FR1.IMGT.g.t +
+                              dat$FR1.IMGT.t.g +
+                              dat$CDR1.IMGT.a.c +
+                              dat$CDR1.IMGT.c.a +
+                              dat$CDR1.IMGT.a.t +
+                              dat$CDR1.IMGT.t.a +
+                              dat$CDR1.IMGT.g.c +
+                              dat$CDR1.IMGT.c.g +
+                              dat$CDR1.IMGT.g.t +
+                              dat$CDR1.IMGT.t.g +
+                              dat$FR2.IMGT.a.c +
+                              dat$FR2.IMGT.c.a +
+                              dat$FR2.IMGT.a.t +
+                              dat$FR2.IMGT.t.a +
+                              dat$FR2.IMGT.g.c +
+                              dat$FR2.IMGT.c.g +
+                              dat$FR2.IMGT.g.t +
+                              dat$FR2.IMGT.t.g +
+                              dat$CDR2.IMGT.a.c +
+                              dat$CDR2.IMGT.c.a +
+                              dat$CDR2.IMGT.a.t +
+                              dat$CDR2.IMGT.t.a +
+                              dat$CDR2.IMGT.g.c +
+                              dat$CDR2.IMGT.c.g +
+                              dat$CDR2.IMGT.g.t +
+                              dat$CDR2.IMGT.t.g +
+                              dat$FR3.IMGT.a.c +
+                              dat$FR3.IMGT.c.a +
+                              dat$FR3.IMGT.a.t +
+                              dat$FR3.IMGT.t.a +
+                              dat$FR3.IMGT.g.c +
+                              dat$FR3.IMGT.c.g +
+                              dat$FR3.IMGT.g.t +
+                              dat$FR3.IMGT.t.g)
+
+result[3,"x"] = transversionMutations
+result[3,"y"] = VRegionMutations
+result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1)
+
+transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a +
+                              dat$FR1.IMGT.c.t +
+                              dat$CDR1.IMGT.g.a +
+                              dat$CDR1.IMGT.c.t +
+                              dat$FR2.IMGT.g.a +
+                              dat$FR2.IMGT.c.t +
+                              dat$CDR2.IMGT.g.a +
+                              dat$CDR2.IMGT.c.t +
+                              dat$FR3.IMGT.g.a +
+                              dat$FR3.IMGT.c.t)
+
+totalMutationsAtGC = sum(dat$FR1.IMGT.g.a +
+                         dat$FR1.IMGT.c.t +
+                         dat$FR1.IMGT.c.a +
+                         dat$FR1.IMGT.g.c +
+                         dat$FR1.IMGT.c.g +
+                         dat$FR1.IMGT.g.t +
+                         dat$CDR1.IMGT.g.a +
+                         dat$CDR1.IMGT.c.t +
+                         dat$CDR1.IMGT.c.a +
+                         dat$CDR1.IMGT.g.c +
+                         dat$CDR1.IMGT.c.g +
+                         dat$CDR1.IMGT.g.t +
+                         dat$FR2.IMGT.g.a +
+                         dat$FR2.IMGT.c.t +
+                         dat$FR2.IMGT.c.a +
+                         dat$FR2.IMGT.g.c +
+                         dat$FR2.IMGT.c.g +
+                         dat$FR2.IMGT.g.t +
+                         dat$CDR2.IMGT.g.a +
+                         dat$CDR2.IMGT.c.t +
+                         dat$CDR2.IMGT.c.a +
+                         dat$CDR2.IMGT.g.c +
+                         dat$CDR2.IMGT.c.g +
+                         dat$CDR2.IMGT.g.t +
+                         dat$FR3.IMGT.g.a +
+                         dat$FR3.IMGT.c.t +
+                         dat$FR3.IMGT.c.a +
+                         dat$FR3.IMGT.g.c +
+                         dat$FR3.IMGT.c.g +
+                         dat$FR3.IMGT.g.t)
+
+result[4,"x"] = transitionMutationsAtGC
+result[4,"y"] = totalMutationsAtGC
+result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1)
+
+result[5,"x"] = totalMutationsAtGC
+result[5,"y"] = VRegionMutations
+result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1)
+
+
+#transitiontable
+
+transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
+row.names(transitionTable) = c("A", "C", "G", "T")
+transitionTable["A","A"] = NA
+transitionTable["C","C"] = NA
+transitionTable["G","G"] = NA
+transitionTable["T","T"] = NA
+nts = c("a", "c", "g", "t")
+
+
+for(nt1 in nts){
+  for(nt2 in nts){
+    if(nt1 == nt2){
+      next
+    }
+    NT1 = LETTERS[letters == nt1]
+    NT2 = LETTERS[letters == nt2]
+    FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
+    CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
+    FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
+    CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
+    FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
+    transitionTable[NT1,NT2] = sum( dat[,FR1] + 
+                                    dat[,CDR1] +
+                                    dat[,FR2] +
+                                    dat[,CDR2] +
+                                    dat[,FR3])
+  }
+}
+
+setwd(outputdir)
+write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
+cat(length(dat$Sequence.ID), file="n.txt")
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+