Mercurial > repos > davidvanzessen > mutation_analysis
diff mutation_analysis.r @ 0:74d2bc479bee draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 18 Aug 2014 04:04:37 -0400 |
parents | |
children | 2f4298673519 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.r Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,328 @@ +args <- commandArgs(trailingOnly = TRUE) + +input = args[1] +summaryinput = args[2] +outputdir = args[3] +setwd(outputdir) + +#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F) +dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) + +datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F) +datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")] + +dat = merge(dat, datSum, by="Sequence.ID", all.x=T) + +#dat = dat[dat$Functionality == "productive",] + +dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele) +dat$VGene = gsub("[*].*", "", dat$VGene) + +dat$past = paste(dat$AA.JUNCTION, dat$VGene) + +#dat = dat[!duplicated(dat$past), ] + +if(length(dat$Sequence.ID) == 0){ + setwd(outputdir) + result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) + row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) + cat("0", file="n.txt") + stop("No data") +} + +#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left")) + +result = data.frame(x = 1:5, y = 1:5, z = 1:5) +row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + +cleanup_columns = c("FR1.IMGT.c.a", + "FR2.IMGT.g.t", + "CDR1.IMGT.Nb.of.nucleotides", + "CDR2.IMGT.t.a", + "FR1.IMGT.c.g", + "CDR1.IMGT.c.t", + "FR2.IMGT.a.c", + "FR2.IMGT.Nb.of.mutations", + "FR2.IMGT.g.c", + "FR2.IMGT.a.g", + "FR3.IMGT.t.a", + "FR3.IMGT.t.c", + "FR2.IMGT.g.a", + "FR3.IMGT.c.g", + "FR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.g.a", + "CDR1.IMGT.t.g", + "CDR1.IMGT.g.c", + "CDR2.IMGT.Nb.of.nucleotides", + "FR2.IMGT.a.t", + "CDR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.a.g", + "FR3.IMGT.a.c", + "FR1.IMGT.g.a", + "FR3.IMGT.a.g", + "FR1.IMGT.a.t", + "CDR2.IMGT.a.g", + "CDR2.IMGT.Nb.of.mutations", + "CDR2.IMGT.g.t", + "CDR2.IMGT.a.c", + "CDR1.IMGT.t.c", + "FR3.IMGT.g.c", + "FR1.IMGT.g.t", + "FR3.IMGT.g.t", + "CDR1.IMGT.a.t", + "FR1.IMGT.a.g", + "FR3.IMGT.a.t", + "FR3.IMGT.Nb.of.nucleotides", + "FR2.IMGT.t.c", + "CDR2.IMGT.g.a", + "FR2.IMGT.t.a", + "CDR1.IMGT.t.a", + "FR2.IMGT.t.g", + "FR3.IMGT.t.g", + "FR2.IMGT.Nb.of.nucleotides", + "FR1.IMGT.t.a", + "FR1.IMGT.t.g", + "FR3.IMGT.c.t", + "FR1.IMGT.t.c", + "CDR2.IMGT.a.t", + "FR2.IMGT.c.t", + "CDR1.IMGT.g.t", + "CDR2.IMGT.t.g", + "FR1.IMGT.Nb.of.nucleotides", + "CDR1.IMGT.c.g", + "CDR2.IMGT.t.c", + "FR3.IMGT.g.a", + "CDR1.IMGT.a.c", + "FR2.IMGT.c.a", + "FR3.IMGT.Nb.of.mutations", + "FR2.IMGT.c.g", + "CDR2.IMGT.g.c", + "FR1.IMGT.g.c", + "CDR2.IMGT.c.t", + "FR3.IMGT.c.a", + "CDR1.IMGT.c.a", + "CDR2.IMGT.c.g", + "CDR2.IMGT.c.a", + "FR1.IMGT.c.t") + +for(col in cleanup_columns){ + dat[,col] = gsub("\\(.*\\)", "", dat[,col]) + #dat[dat[,col] == "",] = "0" + dat[,col] = as.numeric(dat[,col]) + dat[is.na(dat[,col]),] = 0 +} + +VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations + + dat$CDR1.IMGT.Nb.of.mutations + + dat$FR2.IMGT.Nb.of.mutations + + dat$CDR2.IMGT.Nb.of.mutations + + dat$FR3.IMGT.Nb.of.mutations) + +VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides + + dat$CDR1.IMGT.Nb.of.nucleotides + + dat$FR2.IMGT.Nb.of.nucleotides + + dat$CDR2.IMGT.Nb.of.nucleotides + + dat$FR3.IMGT.Nb.of.nucleotides) + +result[1,"x"] = VRegionMutations +result[1,"y"] = VRegionNucleotides +result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1) + +transitionMutations = sum(dat$FR1.IMGT.a.g + + dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$FR1.IMGT.t.c + + dat$CDR1.IMGT.a.g + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$CDR1.IMGT.t.c + + dat$FR2.IMGT.a.g + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$FR2.IMGT.t.c + + dat$CDR2.IMGT.a.g + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$CDR2.IMGT.t.c + + dat$FR3.IMGT.a.g + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t + + dat$FR3.IMGT.t.c) + +result[2,"x"] = transitionMutations +result[2,"y"] = VRegionMutations +result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1) + +transversionMutations = sum( dat$FR1.IMGT.a.c + + dat$FR1.IMGT.c.a + + dat$FR1.IMGT.a.t + + dat$FR1.IMGT.t.a + + dat$FR1.IMGT.g.c + + dat$FR1.IMGT.c.g + + dat$FR1.IMGT.g.t + + dat$FR1.IMGT.t.g + + dat$CDR1.IMGT.a.c + + dat$CDR1.IMGT.c.a + + dat$CDR1.IMGT.a.t + + dat$CDR1.IMGT.t.a + + dat$CDR1.IMGT.g.c + + dat$CDR1.IMGT.c.g + + dat$CDR1.IMGT.g.t + + dat$CDR1.IMGT.t.g + + dat$FR2.IMGT.a.c + + dat$FR2.IMGT.c.a + + dat$FR2.IMGT.a.t + + dat$FR2.IMGT.t.a + + dat$FR2.IMGT.g.c + + dat$FR2.IMGT.c.g + + dat$FR2.IMGT.g.t + + dat$FR2.IMGT.t.g + + dat$CDR2.IMGT.a.c + + dat$CDR2.IMGT.c.a + + dat$CDR2.IMGT.a.t + + dat$CDR2.IMGT.t.a + + dat$CDR2.IMGT.g.c + + dat$CDR2.IMGT.c.g + + dat$CDR2.IMGT.g.t + + dat$CDR2.IMGT.t.g + + dat$FR3.IMGT.a.c + + dat$FR3.IMGT.c.a + + dat$FR3.IMGT.a.t + + dat$FR3.IMGT.t.a + + dat$FR3.IMGT.g.c + + dat$FR3.IMGT.c.g + + dat$FR3.IMGT.g.t + + dat$FR3.IMGT.t.g) + +result[3,"x"] = transversionMutations +result[3,"y"] = VRegionMutations +result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1) + +transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t) + +totalMutationsAtGC = sum(dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$FR1.IMGT.c.a + + dat$FR1.IMGT.g.c + + dat$FR1.IMGT.c.g + + dat$FR1.IMGT.g.t + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$CDR1.IMGT.c.a + + dat$CDR1.IMGT.g.c + + dat$CDR1.IMGT.c.g + + dat$CDR1.IMGT.g.t + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$FR2.IMGT.c.a + + dat$FR2.IMGT.g.c + + dat$FR2.IMGT.c.g + + dat$FR2.IMGT.g.t + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$CDR2.IMGT.c.a + + dat$CDR2.IMGT.g.c + + dat$CDR2.IMGT.c.g + + dat$CDR2.IMGT.g.t + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t + + dat$FR3.IMGT.c.a + + dat$FR3.IMGT.g.c + + dat$FR3.IMGT.c.g + + dat$FR3.IMGT.g.t) + +result[4,"x"] = transitionMutationsAtGC +result[4,"y"] = totalMutationsAtGC +result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1) + +result[5,"x"] = totalMutationsAtGC +result[5,"y"] = VRegionMutations +result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1) + + +#transitiontable + +transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) +row.names(transitionTable) = c("A", "C", "G", "T") +transitionTable["A","A"] = NA +transitionTable["C","C"] = NA +transitionTable["G","G"] = NA +transitionTable["T","T"] = NA +nts = c("a", "c", "g", "t") + + +for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + transitionTable[NT1,NT2] = sum( dat[,FR1] + + dat[,CDR1] + + dat[,FR2] + + dat[,CDR2] + + dat[,FR3]) + } +} + +setwd(outputdir) +write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) +write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) +cat(length(dat$Sequence.ID), file="n.txt") + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +