Mercurial > repos > davidvanzessen > report_clonality_tcell_igg
diff RScript.r @ 2:fd1b76816395 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 27 Mar 2014 10:54:03 -0400 |
parents | 1d429107cd26 |
children | b850200d4335 |
line wrap: on
line diff
--- a/RScript.r Tue Mar 11 11:13:33 2014 -0400 +++ b/RScript.r Thu Mar 27 10:54:03 2014 -0400 @@ -6,7 +6,6 @@ outFile = args[2] outDir = args[3] clonalType = args[4] -species = args[5] if (!("gridExtra" %in% rownames(installed.packages()))) { install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") @@ -26,11 +25,18 @@ } library(data.table) +if (!("reshape2" %in% rownames(installed.packages()))) { + install.packages("reshape2", repos="http://cran.xl-mirror.nl/") +} +library(reshape2) + test = read.table(inFile, sep="\t", header=TRUE, fill=T, comment.char="") test = test[test$Sample != "",] + + test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene) test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene) test$Top.J.Gene = gsub("[*]([0-9]+)", "", test$Top.J.Gene) @@ -50,23 +56,31 @@ PRODF = PROD #PRODF = unique(PRODF) -PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ] +if(any(grepl(pattern="_", x=PRODF$ID))){ -PRODFV = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.V.Gene")]) + PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID) + PRODF$freq = gsub("_.*", "", PRODF$freq) + PRODF$freq = as.numeric(PRODF$freq) +} else { + PRODF$freq = 1 + PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ] +} + +PRODFV = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.V.Gene")]) PRODFV$Length = as.numeric(PRODFV$Length) Total = 0 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) -PRODFD = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.D.Gene")]) +PRODFD = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.D.Gene")]) PRODFD$Length = as.numeric(PRODFD$Length) Total = 0 Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length))) PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE) PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total)) -PRODFJ = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.J.Gene")]) +PRODFJ = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.J.Gene")]) PRODFJ$Length = as.numeric(PRODFJ$Length) Total = 0 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) @@ -94,11 +108,12 @@ setwd(outDir) -write.table(PRODF, "allUnique.tsv", sep="\t",quote=F,row.names=F,col.names=T) +write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) pV = ggplot(PRODFV) pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") +write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) png("VPlot.png",width = 1280, height = 720) pV @@ -107,6 +122,7 @@ pD = ggplot(PRODFD) pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") +write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) png("DPlot.png",width = 800, height = 600) pD @@ -115,18 +131,76 @@ pJ = ggplot(PRODFJ) pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") +write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) png("JPlot.png",width = 800, height = 600) pJ dev.off(); +VGenes = PRODF[,c("Sample", "Top.V.Gene", "freq")] +VGenes$Top.V.Gene = gsub("-.*", "", VGenes$Top.V.Gene) +VGenes = data.frame(data.table(VGenes)[, list(Count=sum(freq)), by=c("Sample", "Top.V.Gene")]) +TotalPerSample = data.frame(data.table(VGenes)[, list(total=sum(.SD$Count)), by=Sample]) +VGenes = merge(VGenes, TotalPerSample, by="Sample") +VGenes$Frequency = VGenes$Count * 100 / VGenes$total +VPlot = ggplot(VGenes) +VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + ggtitle("Distribution of V gene families") + + ylab("Percentage of sequences") +png("VFPlot.png") +VPlot +dev.off(); +write.table(x=VGenes, file="VFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +DGenes = PRODF[,c("Sample", "Top.D.Gene", "freq")] +DGenes$Top.D.Gene = gsub("-.*", "", DGenes$Top.D.Gene) +DGenes = data.frame(data.table(DGenes)[, list(Count=sum(freq)), by=c("Sample", "Top.D.Gene")]) +TotalPerSample = data.frame(data.table(DGenes)[, list(total=sum(.SD$Count)), by=Sample]) +DGenes = merge(DGenes, TotalPerSample, by="Sample") +DGenes$Frequency = DGenes$Count * 100 / DGenes$total +DPlot = ggplot(DGenes) +DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + ggtitle("Distribution of D gene families") + + ylab("Percentage of sequences") +png("DFPlot.png") +DPlot +dev.off(); +write.table(x=DGenes, file="DFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +JGenes = PRODF[,c("Sample", "Top.J.Gene", "freq")] +JGenes$Top.J.Gene = gsub("-.*", "", JGenes$Top.J.Gene) +JGenes = data.frame(data.table(JGenes)[, list(Count=sum(freq)), by=c("Sample", "Top.J.Gene")]) +TotalPerSample = data.frame(data.table(JGenes)[, list(total=sum(.SD$Count)), by=Sample]) +JGenes = merge(JGenes, TotalPerSample, by="Sample") +JGenes$Frequency = JGenes$Count * 100 / JGenes$total +JPlot = ggplot(JGenes) +JPlot = JPlot + geom_bar(aes( x = Top.J.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + ggtitle("Distribution of J gene families") + + ylab("Percentage of sequences") +png("JFPlot.png") +JPlot +dev.off(); +write.table(x=JGenes, file="JFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +CDR3Length = data.frame(data.table(PRODF)[, list(Count=sum(freq)), by=c("Sample", "CDR3.Length.DNA")]) +TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample]) +CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample") +CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total +CDR3LengthPlot = ggplot(CDR3Length) +CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + ggtitle("Length distribution of CDR3") + + xlab("CDR3 Length") + + ylab("Percentage of sequences") +png("CDR3LengthPlot.png",width = 1280, height = 720) +CDR3LengthPlot +dev.off() +write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) + revVchain = Vchain revDchain = Dchain revVchain$chr.orderV = rev(revVchain$chr.orderV) revDchain$chr.orderD = rev(revDchain$chr.orderD) -cat("before VD", "\n") - plotVD <- function(dat){ if(length(dat[,1]) == 0){ return() @@ -141,10 +215,12 @@ png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) print(img) + dev.off() + write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) } -VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) +VandDCount = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) VandDCount$l = log(VandDCount$Length) maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) @@ -160,9 +236,7 @@ lapply(VDList, FUN=plotVD) -cat("after VD", "\n") -cat("before VJ", "\n") plotVJ <- function(dat){ if(length(dat[,1]) == 0){ @@ -179,9 +253,10 @@ png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) print(img) dev.off() + write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) } -VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) +VandJCount = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) VandJCount$l = log(VandJCount$Length) maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) @@ -196,10 +271,6 @@ VJList = split(completeVJ, f=completeVJ[,"Sample"]) lapply(VJList, FUN=plotVJ) -cat("after VJ", "\n") - -cat("before DJ", "\n") - plotDJ <- function(dat){ if(length(dat[,1]) == 0){ return() @@ -215,6 +286,7 @@ png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) print(img) dev.off() + write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) } DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")]) @@ -232,7 +304,6 @@ DJList = split(completeDJ, f=completeDJ[,"Sample"]) lapply(DJList, FUN=plotDJ) -cat("after DJ", "\n") sampleFile <- file("samples.txt") un = unique(test$Sample) @@ -241,27 +312,26 @@ close(sampleFile) - if("Replicate" %in% colnames(test)) { clonalityFrame = PROD clonalityFrame$ReplicateConcat = do.call(paste, c(clonalityFrame[c("VDJCDR3", "Sample", "Replicate")], sep = ":")) clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] - write.table(clonalityFrame, "clonalityComplete.tsv", sep="\t",quote=F,row.names=F,col.names=T) + write.table(clonalityFrame, "clonalityComplete.csv", sep=",",quote=F,row.names=F,col.names=T) ClonalitySampleReplicatePrint <- function(dat){ - write.table(dat, paste("clonality_", unique(dat$Sample) , "_", unique(dat$Replicate), ".tsv", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + write.table(dat, paste("clonality_", unique(dat$Sample) , "_", unique(dat$Replicate), ".csv", sep=""), sep=",",quote=F,row.names=F,col.names=T) } clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,c("Sample", "Replicate")]) - lapply(clonalityFrameSplit, FUN=ClonalitySampleReplicatePrint) + #lapply(clonalityFrameSplit, FUN=ClonalitySampleReplicatePrint) ClonalitySamplePrint <- function(dat){ - write.table(dat, paste("clonality_", unique(dat$Sample) , ".tsv", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + write.table(dat, paste("clonality_", unique(dat$Sample) , ".csv", sep=""), sep=",",quote=F,row.names=F,col.names=T) } clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,"Sample"]) - lapply(clonalityFrameSplit, FUN=ClonalitySamplePrint) + #lapply(clonalityFrameSplit, FUN=ClonalitySamplePrint) clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "VDJCDR3")]) clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")])