mutation_analysis: mutation_analysis.r comparison

comparison mutation_analysis.r @ 118:ad7ca9c2b748 draft

Uploaded

author	davidvanzessen
date	Thu, 11 Aug 2016 08:00:00 -0400
parents	ede6c4ee5196
children	626a956f3811

comparison

equal deleted inserted replaced

-:a8f91c52411c
+:ad7ca9c2b748
 "FR3.IMGT.Nb.of.silent.mutations",
 "FR1.IMGT.Nb.of.nonsilent.mutations",
 "FR2.IMGT.Nb.of.nonsilent.mutations",
 "FR3.IMGT.Nb.of.nonsilent.mutations")
+print("Cleaning up columns")
 for(col in cleanup_columns){
 dat[,col] = gsub("\\(.*\\)", "", dat[,col])
 #dat[dat[,col] == "",] = "0"
 dat[,col] = as.numeric(dat[,col])
 dat[is.na(dat[,col]),col] = 0
 if(!include_fr1){
 	regions = c("CDR1", "FR2", "CDR2", "FR3")
 }
 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }
+print("aggregating data into new columns")
 VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="")
 dat$VRegionMutations =  apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns)
 VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="")
 zeros=rep(0, 4)
 funcs = c(median, sum, mean)
 fnames = c("median", "sum", "mean")
+print("Creating result tables")
 for(i in 1:length(funcs)){
 	func = funcs[[i]]
 	fname = fnames[[i]]
 	rows = 9
 	if(fname == "sum"){
 		rows = 11
 	}
 	matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows)
 	for(i in 1:length(genes)){
-	  matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i])
+		print(paste("Creating table for", fname, genes[i]))
+		matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i])
 	}
 	matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all")
 	result = data.frame(matrx)
 	}
 	write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F)
 }
+print("Adding median number of mutations to sum table")
 sum.table = read.table("mutations_sum.txt", sep=",", header=F)
 median.table = read.table("mutations_median.txt", sep=",", header=F)
-#sum.table["Median of Number of Mutations (%)",] = median.table[1,]
 new.table = sum.table[1,]
 new.table[2,] = median.table[1,]
 new.table[3:12,] = sum.table[2:11,]
 new.table[,1] = as.character(new.table[,1])
 new.table[2,1] = "Median of Number of Mutations (%)"
 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),]
 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F)
+print("Plotting ca piechart")
 dat = dat[!grepl("^unmatched", dat$best_match),]
 #blegh
 genesForPlot = dat[grepl("ca", dat$best_match),]$best_match
 	png(filename="ca.png")
 	print(pc)
 	dev.off()
 }
+print("Plotting cg piechart")
 genesForPlot = dat[grepl("cg", dat$best_match),]$best_match
 if(length(genesForPlot) > 0){
 	genesForPlot = data.frame(table(genesForPlot))
 	colnames(genesForPlot) = c("Gene","Freq")
 	genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
 	png(filename="cg.png")
 	print(pc)
 	dev.off()
 }
+print("Plotting scatterplot")
 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)
 p = ggplot(dat, aes(best_match, percentage_mutations))
 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)
 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")
 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T)
+print("Plotting frequency ranges plot")
 dat$best_match_class = substr(dat$best_match, 0, 2)
 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)
 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])

Mercurial > repos > davidvanzessen > mutation_analysis

comparison mutation_analysis.r @ 118:ad7ca9c2b748 draft