Mercurial > repos > davidvanzessen > mutation_analysis
diff mutation_analysis.r @ 22:d84c9791d8c4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 07 Apr 2015 03:52:34 -0400 |
parents | cb7c65e3e43f |
children | 28b8d980db22 |
line wrap: on
line diff
--- a/mutation_analysis.r Thu Apr 02 03:31:23 2015 -0400 +++ b/mutation_analysis.r Tue Apr 07 03:52:34 2015 -0400 @@ -3,6 +3,8 @@ input = args[1] genes = unlist(strsplit(args[2], ",")) outputdir = args[3] +print(args[4]) +include_fr1 = ifelse(args[4] == "yes", T, F) setwd(outputdir) dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) @@ -102,105 +104,38 @@ dat[is.na(dat[,col]),] = 0 } -dat$VRegionMutations = dat$CDR1.IMGT.Nb.of.mutations + - dat$FR2.IMGT.Nb.of.mutations + - dat$CDR2.IMGT.Nb.of.mutations + - dat$FR3.IMGT.Nb.of.mutations - -dat$VRegionNucleotides = dat$CDR1.IMGT.Nb.of.nucleotides + - dat$FR2.IMGT.Nb.of.nucleotides + - dat$CDR2.IMGT.Nb.of.nucleotides + - dat$FR3.IMGT.Nb.of.nucleotides +regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3") +if(!include_fr1){ + regions = c("CDR1", "FR2", "CDR2", "FR3") +} -dat$transitionMutations = dat$CDR1.IMGT.a.g + - dat$CDR1.IMGT.g.a + - dat$CDR1.IMGT.c.t + - dat$CDR1.IMGT.t.c + - dat$FR2.IMGT.a.g + - dat$FR2.IMGT.g.a + - dat$FR2.IMGT.c.t + - dat$FR2.IMGT.t.c + - dat$CDR2.IMGT.a.g + - dat$CDR2.IMGT.g.a + - dat$CDR2.IMGT.c.t + - dat$CDR2.IMGT.t.c + - dat$FR3.IMGT.a.g + - dat$FR3.IMGT.g.a + - dat$FR3.IMGT.c.t + - dat$FR3.IMGT.t.c +sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } -dat$transversionMutations = dat$CDR1.IMGT.a.c + - dat$CDR1.IMGT.c.a + - dat$CDR1.IMGT.a.t + - dat$CDR1.IMGT.t.a + - dat$CDR1.IMGT.g.c + - dat$CDR1.IMGT.c.g + - dat$CDR1.IMGT.g.t + - dat$CDR1.IMGT.t.g + - dat$FR2.IMGT.a.c + - dat$FR2.IMGT.c.a + - dat$FR2.IMGT.a.t + - dat$FR2.IMGT.t.a + - dat$FR2.IMGT.g.c + - dat$FR2.IMGT.c.g + - dat$FR2.IMGT.g.t + - dat$FR2.IMGT.t.g + - dat$CDR2.IMGT.a.c + - dat$CDR2.IMGT.c.a + - dat$CDR2.IMGT.a.t + - dat$CDR2.IMGT.t.a + - dat$CDR2.IMGT.g.c + - dat$CDR2.IMGT.c.g + - dat$CDR2.IMGT.g.t + - dat$CDR2.IMGT.t.g + - dat$FR3.IMGT.a.c + - dat$FR3.IMGT.c.a + - dat$FR3.IMGT.a.t + - dat$FR3.IMGT.t.a + - dat$FR3.IMGT.g.c + - dat$FR3.IMGT.c.g + - dat$FR3.IMGT.g.t + - dat$FR3.IMGT.t.g +VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="") +dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns) + +VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="") +dat$VRegionNucleotides = apply(dat, FUN=sum_by_row, 1, columns=VRegionNucleotides_columns) + +transitionMutations_columns = paste(rep(regions, each=4), c(".IMGT.a.g", ".IMGT.g.a", ".IMGT.c.t", ".IMGT.t.c"), sep="") +dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns) + +transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="") +dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns) -dat$transitionMutationsAtGC = dat$CDR1.IMGT.g.a + - dat$CDR1.IMGT.c.t + - dat$FR2.IMGT.g.a + - dat$FR2.IMGT.c.t + - dat$CDR2.IMGT.g.a + - dat$CDR2.IMGT.c.t + - dat$FR3.IMGT.g.a + - dat$FR3.IMGT.c.t +transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") +dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) -dat$totalMutationsAtGC = dat$CDR1.IMGT.g.a + - dat$CDR1.IMGT.c.t + - dat$CDR1.IMGT.c.a + - dat$CDR1.IMGT.g.c + - dat$CDR1.IMGT.c.g + - dat$CDR1.IMGT.g.t + - dat$FR2.IMGT.g.a + - dat$FR2.IMGT.c.t + - dat$FR2.IMGT.c.a + - dat$FR2.IMGT.g.c + - dat$FR2.IMGT.c.g + - dat$FR2.IMGT.g.t + - dat$CDR2.IMGT.g.a + - dat$CDR2.IMGT.c.t + - dat$CDR2.IMGT.c.a + - dat$CDR2.IMGT.g.c + - dat$CDR2.IMGT.c.g + - dat$CDR2.IMGT.g.t + - dat$FR3.IMGT.g.a + - dat$FR3.IMGT.c.t + - dat$FR3.IMGT.c.a + - dat$FR3.IMGT.g.c + - dat$FR3.IMGT.c.g + - dat$FR3.IMGT.g.t +totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t"), sep="") +dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) setwd(outputdir) +nts = c("a", "t", "g", "c") +zeros=rep(0, 4) matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=5) for(i in 1:length(genes)){ gene = genes[i] @@ -208,10 +143,6 @@ if(gene == "."){ tmp = dat } - if(length(tmp) == 0){ - cat("0", file=paste(gene, "_value.txt" ,sep="")) - next - } j = i - 1 x = (j * 3) + 1 y = (j * 3) + 2 @@ -232,33 +163,36 @@ matrx[5,y] = sum(tmp$VRegionMutations) matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) - transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) + transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) row.names(transitionTable) = c("A", "C", "G", "T") transitionTable["A","A"] = NA transitionTable["C","C"] = NA transitionTable["G","G"] = NA transitionTable["T","T"] = NA - nts = c("a", "c", "g", "t") + + if(nrow(tmp) > 0){ + for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + if(include_fr1){ + transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) + } else { + transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) + } + } + } + } - for(nt1 in nts){ - for(nt2 in nts){ - if(nt1 == nt2){ - next - } - NT1 = LETTERS[letters == nt1] - NT2 = LETTERS[letters == nt2] - FR1 = 0 #paste("FR1.IMGT.", nt1, ".", nt2, sep="") - CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") - FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") - CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") - FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") - transitionTable[NT1,NT2] = sum( tmp[,CDR1] + - tmp[,FR2] + - tmp[,CDR2] + - tmp[,FR3]) - } - } write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) @@ -294,7 +228,6 @@ transitionTable["C","C"] = NA transitionTable["G","G"] = NA transitionTable["T","T"] = NA -nts = c("a", "c", "g", "t") for(nt1 in nts){ @@ -309,10 +242,11 @@ FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") - transitionTable[NT1,NT2] = sum( tmp[,CDR1] + - tmp[,FR2] + - tmp[,CDR2] + - tmp[,FR3]) + if(include_fr1){ + transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) + } else { + transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) + } } } write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) @@ -382,3 +316,45 @@ print(pc) dev.off() } + +dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) + +p = ggplot(dat, aes(best_match, percentage_mutations))# + scale_y_log10(breaks=scales,labels=scales) +p = p + geom_point(aes(colour=best_match), position="jitter") +p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + +png(filename="scatter.png") +print(p) +dev.off() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +