Mercurial > repos > davidvanzessen > mutation_analysis
diff mutation_analysis.r @ 4:069419cccba4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 22 Sep 2014 10:19:36 -0400 |
parents | a0b27058dcac |
children | cb7c65e3e43f |
line wrap: on
line diff
--- a/mutation_analysis.r Wed Sep 17 07:25:17 2014 -0400 +++ b/mutation_analysis.r Mon Sep 22 10:19:36 2014 -0400 @@ -1,38 +1,29 @@ args <- commandArgs(trailingOnly = TRUE) input = args[1] -summaryinput = args[2] +genes = unlist(strsplit(args[2], ",")) outputdir = args[3] setwd(outputdir) -#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F) dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) -datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F) -datSum = datSum[,c("Sequence.ID","J.GENE.and.allele", "AA.JUNCTION")] - -dat = merge(dat, datSum, by="Sequence.ID", all.x=T) - if(length(dat$Sequence.ID) == 0){ - setwd(outputdir) - result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) - row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") - write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) - transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) - row.names(transitionTable) = c("A", "C", "G", "T") - transitionTable["A","A"] = NA - transitionTable["C","C"] = NA - transitionTable["G","G"] = NA - transitionTable["T","T"] = NA - write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) - cat("0", file="n.txt") - stop("No data") + setwd(outputdir) + result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) + row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) + cat("0", file="n.txt") + stop("No data") } -#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left")) -result = data.frame(x = 1:5, y = 1:5, z = 1:5) -row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") cleanup_columns = c("FR1.IMGT.c.a", "FR2.IMGT.g.t", @@ -111,32 +102,19 @@ dat[is.na(dat[,col]),] = 0 } -dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele) -dat$VGene = gsub("[*].*", "", dat$VGene) -dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele) -dat$JGene = gsub("[*].*", "", dat$JGene) - -dat$past = paste(dat$AA.JUNCTION, dat$VGene, dat$JGene, (dat$FR1.IMGT.Nb.of.mutations + dat$CDR1.IMGT.Nb.of.mutations + dat$FR2.IMGT.Nb.of.mutations + dat$CDR2.IMGT.Nb.of.mutations + dat$FR3.IMGT.Nb.of.mutations)) - -dat = dat[!duplicated(dat$past), ] - -VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations + +dat$VRegionMutations = dat$FR1.IMGT.Nb.of.mutations + dat$CDR1.IMGT.Nb.of.mutations + dat$FR2.IMGT.Nb.of.mutations + dat$CDR2.IMGT.Nb.of.mutations + - dat$FR3.IMGT.Nb.of.mutations) + dat$FR3.IMGT.Nb.of.mutations -VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides + +dat$VRegionNucleotides = dat$FR1.IMGT.Nb.of.nucleotides + dat$CDR1.IMGT.Nb.of.nucleotides + dat$FR2.IMGT.Nb.of.nucleotides + dat$CDR2.IMGT.Nb.of.nucleotides + - dat$FR3.IMGT.Nb.of.nucleotides) + dat$FR3.IMGT.Nb.of.nucleotides -result[1,"x"] = VRegionMutations -result[1,"y"] = VRegionNucleotides -result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1) - -transitionMutations = sum(dat$FR1.IMGT.a.g + +dat$transitionMutations = dat$FR1.IMGT.a.g + dat$FR1.IMGT.g.a + dat$FR1.IMGT.c.t + dat$FR1.IMGT.t.c + @@ -155,58 +133,51 @@ dat$FR3.IMGT.a.g + dat$FR3.IMGT.g.a + dat$FR3.IMGT.c.t + - dat$FR3.IMGT.t.c) - -result[2,"x"] = transitionMutations -result[2,"y"] = VRegionMutations -result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1) + dat$FR3.IMGT.t.c -transversionMutations = sum( dat$FR1.IMGT.a.c + - dat$FR1.IMGT.c.a + - dat$FR1.IMGT.a.t + - dat$FR1.IMGT.t.a + - dat$FR1.IMGT.g.c + - dat$FR1.IMGT.c.g + - dat$FR1.IMGT.g.t + - dat$FR1.IMGT.t.g + - dat$CDR1.IMGT.a.c + - dat$CDR1.IMGT.c.a + - dat$CDR1.IMGT.a.t + - dat$CDR1.IMGT.t.a + - dat$CDR1.IMGT.g.c + - dat$CDR1.IMGT.c.g + - dat$CDR1.IMGT.g.t + - dat$CDR1.IMGT.t.g + - dat$FR2.IMGT.a.c + - dat$FR2.IMGT.c.a + - dat$FR2.IMGT.a.t + - dat$FR2.IMGT.t.a + - dat$FR2.IMGT.g.c + - dat$FR2.IMGT.c.g + - dat$FR2.IMGT.g.t + - dat$FR2.IMGT.t.g + - dat$CDR2.IMGT.a.c + - dat$CDR2.IMGT.c.a + - dat$CDR2.IMGT.a.t + - dat$CDR2.IMGT.t.a + - dat$CDR2.IMGT.g.c + - dat$CDR2.IMGT.c.g + - dat$CDR2.IMGT.g.t + - dat$CDR2.IMGT.t.g + - dat$FR3.IMGT.a.c + - dat$FR3.IMGT.c.a + - dat$FR3.IMGT.a.t + - dat$FR3.IMGT.t.a + - dat$FR3.IMGT.g.c + - dat$FR3.IMGT.c.g + - dat$FR3.IMGT.g.t + - dat$FR3.IMGT.t.g) +dat$transversionMutations = dat$FR1.IMGT.a.c + + dat$FR1.IMGT.c.a + + dat$FR1.IMGT.a.t + + dat$FR1.IMGT.t.a + + dat$FR1.IMGT.g.c + + dat$FR1.IMGT.c.g + + dat$FR1.IMGT.g.t + + dat$FR1.IMGT.t.g + + dat$CDR1.IMGT.a.c + + dat$CDR1.IMGT.c.a + + dat$CDR1.IMGT.a.t + + dat$CDR1.IMGT.t.a + + dat$CDR1.IMGT.g.c + + dat$CDR1.IMGT.c.g + + dat$CDR1.IMGT.g.t + + dat$CDR1.IMGT.t.g + + dat$FR2.IMGT.a.c + + dat$FR2.IMGT.c.a + + dat$FR2.IMGT.a.t + + dat$FR2.IMGT.t.a + + dat$FR2.IMGT.g.c + + dat$FR2.IMGT.c.g + + dat$FR2.IMGT.g.t + + dat$FR2.IMGT.t.g + + dat$CDR2.IMGT.a.c + + dat$CDR2.IMGT.c.a + + dat$CDR2.IMGT.a.t + + dat$CDR2.IMGT.t.a + + dat$CDR2.IMGT.g.c + + dat$CDR2.IMGT.c.g + + dat$CDR2.IMGT.g.t + + dat$CDR2.IMGT.t.g + + dat$FR3.IMGT.a.c + + dat$FR3.IMGT.c.a + + dat$FR3.IMGT.a.t + + dat$FR3.IMGT.t.a + + dat$FR3.IMGT.g.c + + dat$FR3.IMGT.c.g + + dat$FR3.IMGT.g.t + + dat$FR3.IMGT.t.g -result[3,"x"] = transversionMutations -result[3,"y"] = VRegionMutations -result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1) -transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a + +dat$transitionMutationsAtGC = dat$FR1.IMGT.g.a + dat$FR1.IMGT.c.t + dat$CDR1.IMGT.g.a + dat$CDR1.IMGT.c.t + @@ -215,9 +186,9 @@ dat$CDR2.IMGT.g.a + dat$CDR2.IMGT.c.t + dat$FR3.IMGT.g.a + - dat$FR3.IMGT.c.t) + dat$FR3.IMGT.c.t -totalMutationsAtGC = sum(dat$FR1.IMGT.g.a + +dat$totalMutationsAtGC = dat$FR1.IMGT.g.a + dat$FR1.IMGT.c.t + dat$FR1.IMGT.c.a + dat$FR1.IMGT.g.c + @@ -246,18 +217,99 @@ dat$FR3.IMGT.c.a + dat$FR3.IMGT.g.c + dat$FR3.IMGT.c.g + - dat$FR3.IMGT.g.t) + dat$FR3.IMGT.g.t -result[4,"x"] = transitionMutationsAtGC -result[4,"y"] = totalMutationsAtGC -result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1) - -result[5,"x"] = totalMutationsAtGC -result[5,"y"] = VRegionMutations -result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1) -#transitiontable +setwd(outputdir) + +matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=5) +for(i in 1:length(genes)){ + gene = genes[i] + tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] + if(gene == "."){ + tmp = dat + } + if(length(tmp) == 0){ + cat("0", file=paste(gene, "_value.txt" ,sep="")) + next + } + j = i - 1 + x = (j * 3) + 1 + y = (j * 3) + 2 + z = (j * 3) + 3 + matrx[1,x] = sum(tmp$VRegionMutations) + matrx[1,y] = sum(tmp$VRegionNucleotides) + matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) + matrx[2,x] = sum(tmp$transitionMutations) + matrx[2,y] = sum(tmp$VRegionMutations) + matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) + matrx[3,x] = sum(tmp$transversionMutations) + matrx[3,y] = sum(tmp$VRegionMutations) + matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) + matrx[4,x] = sum(tmp$transitionMutationsAtGC) + matrx[4,y] = sum(tmp$totalMutationsAtGC) + matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) + matrx[5,x] = sum(tmp$totalMutationsAtGC) + matrx[5,y] = sum(tmp$VRegionMutations) + matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) + + transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + nts = c("a", "c", "g", "t") + + + for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + transitionTable[NT1,NT2] = sum( tmp[,FR1] + + tmp[,CDR1] + + tmp[,FR2] + + tmp[,CDR2] + + tmp[,FR3]) + } + } + write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) + write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + + cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep="")) + cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep="")) +} + +#again for all of the data +tmp = dat +j = i +x = (j * 3) + 1 +y = (j * 3) + 2 +z = (j * 3) + 3 +matrx[1,x] = sum(tmp$VRegionMutations) +matrx[1,y] = sum(tmp$VRegionNucleotides) +matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) +matrx[2,x] = sum(tmp$transitionMutations) +matrx[2,y] = sum(tmp$VRegionMutations) +matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) +matrx[3,x] = sum(tmp$transversionMutations) +matrx[3,y] = sum(tmp$VRegionMutations) +matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) +matrx[4,x] = sum(tmp$transitionMutationsAtGC) +matrx[4,y] = sum(tmp$totalMutationsAtGC) +matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) +matrx[5,x] = sum(tmp$totalMutationsAtGC) +matrx[5,y] = sum(tmp$VRegionMutations) +matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) row.names(transitionTable) = c("A", "C", "G", "T") @@ -269,40 +321,88 @@ for(nt1 in nts){ - for(nt2 in nts){ - if(nt1 == nt2){ - next - } - NT1 = LETTERS[letters == nt1] - NT2 = LETTERS[letters == nt2] - FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") - CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") - FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") - CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") - FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") - transitionTable[NT1,NT2] = sum( dat[,FR1] + - dat[,CDR1] + - dat[,FR2] + - dat[,CDR2] + - dat[,FR3]) - } + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + transitionTable[NT1,NT2] = sum( tmp[,FR1] + + tmp[,CDR1] + + tmp[,FR2] + + tmp[,CDR2] + + tmp[,FR3]) + } +} +write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) +write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T) +cat(matrx[1,x], file="total_value.txt") +cat(length(tmp$Sequence.ID), file="total_n.txt") + + + +result = data.frame(matrx) +row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)") + +write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + + +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +} +library(ggplot2) + +genesForPlot = gsub("[0-9]", "", dat$best_match) +genesForPlot = data.frame(table(genesForPlot)) +colnames(genesForPlot) = c("Gene","Freq") +genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + +pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) +pc = pc + geom_bar(width = 1, stat = "identity") +pc = pc + coord_polar(theta="y") +pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")")) + +png(filename="all.png") +pc +dev.off() + + +#blegh +genesForPlot = dat[grepl("ca", dat$best_match),]$best_match +if(length(genesForPlot) > 0){ + genesForPlot = data.frame(table(genesForPlot)) + colnames(genesForPlot) = c("Gene","Freq") + genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) + pc = pc + geom_bar(width = 1, stat = "identity") + pc = pc + coord_polar(theta="y") + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")")) + + + png(filename="ca.png") + print(pc) + dev.off() } -setwd(outputdir) -write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) -write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) -cat(length(dat$Sequence.ID), file="n.txt") +genesForPlot = dat[grepl("cg", dat$best_match),]$best_match +if(length(genesForPlot) > 0){ + genesForPlot = data.frame(table(genesForPlot)) + colnames(genesForPlot) = c("Gene","Freq") + genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) + pc = pc + geom_bar(width = 1, stat = "identity") + pc = pc + coord_polar(theta="y") + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG", "( n =", sum(genesForPlot$Freq), ")")) -weblogo = dat[,c("Sequence.ID", "VGene")] -weblogo$VGene = gsub("\\*.*", "", weblogo$VGene) - -rs12 = read.table("HS12RSS.txt", sep="\t", header=TRUE) -rs23 = read.table("HS23RSS.txt", sep="\t", header=TRUE) - -result12 = merge(weblogo, rs12, by.x="VGene", by.y="Gene") -result23 = merge(weblogo, rs23, by.x="VGene", by.y="Gene") - -write.table(x=result12$Sequence, file="weblogo_in_rs12.txt", sep=",",quote=F,row.names=F,col.names=F) -write.table(x=result23$Sequence, file="weblogo_in_rs23.txt", sep=",",quote=F,row.names=F,col.names=F) - + png(filename="cg.png") + print(pc) + dev.off() +}