Mercurial > repos > davidvanzessen > mutation_analysis

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/aa_histogram.r	Wed Apr 08 05:25:52 2015 -0400
@@ -0,0 +1,21 @@
+library(ggplot2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+input = args[1]
+outfile = args[2]
+
+dat = read.table(input, header=F, sep=",")
+dat=as.numeric(dat[1,])
+dat_sum = sum(dat)
+
+dat_freq = dat / dat_sum * 100
+dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+
+m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+m = m + geom_histogram(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=1:length(dat_freq), labels=1:length(dat_freq))
+write.table(dat_dt, paste(dirname(outfile), "/aa_histogram.txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+
+png(filename=outfile, width=1280, height=720)
+print(m)
+dev.off()
--- a/merge_and_filter.r	Tue Apr 07 07:32:43 2015 -0400
+++ b/merge_and_filter.r	Wed Apr 08 05:25:52 2015 -0400
@@ -32,6 +32,7 @@
 higher_than=(summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70)
 unmatched = summ[!higher_than,]
 unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
+unmatched$best_match = paste("unmatched,", unmatched$best_match)
 summ = summ[higher_than,]

 if(length(summ$Sequence.ID) == 0){
--- a/mutation_analysis.py	Tue Apr 07 07:32:43 2015 -0400
+++ b/mutation_analysis.py	Wed Apr 08 05:25:52 2015 -0400
@@ -26,6 +26,9 @@
 cdr2Index = 0
 fr3Index = 0
 first=True
+IDlist = []
+mutationList = []
+
 with open(infile, 'r') as i:
   for line in i:
 		if first:
@@ -49,6 +52,23 @@
 		mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
 		mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
 		mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
+
+		mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+
+		IDlist += [ID]
+
+
+AA_mutation = [0] * (int(max(mutationList, key=lambda i:int(i[4]) if i[4] else 0)[4]) + 1)
+
+for mutation in mutationList:
+	if mutation[4]: #if non silent mutation
+		AA_mutation[int(mutation[4])] += 1
+
+print AA_mutation
+
+aa_mutations_file = outfile[:outfile.rindex("/")] + "/aa_mutations.txt"
+with open(aa_mutations_file, 'w') as o:
+	o.write(",".join([str(x) for x in AA_mutation]) + "\n")

 if linecount == 0:
 	print "No data, exiting"
@@ -72,7 +92,6 @@
 aggctatIndex = 0
 atagcctIndex = 0
 first = True
-IDlist = []
 with open(infile, 'r') as i:
 	for line in i:
 		if first:
@@ -94,7 +113,7 @@
 		WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
 		WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
 		TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-		IDlist += [ID]
+


 directory = outfile[:outfile.rfind("/") + 1]
--- a/mutation_analysis.r	Tue Apr 07 07:32:43 2015 -0400
+++ b/mutation_analysis.r	Wed Apr 08 05:25:52 2015 -0400
@@ -148,7 +148,7 @@

 setwd(outputdir)

-nts = c("a", "t", "g", "c")
+nts = c("a", "c", "g", "t")
 zeros=rep(0, 4)
 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=7)
 for(i in 1:length(genes)){
@@ -298,11 +298,13 @@
 genesForPlot = data.frame(table(genesForPlot))
 colnames(genesForPlot) = c("Gene","Freq")
 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
+write.table(genesForPlot, "all.txt", sep="\t",quote=F,row.names=F,col.names=T)
+

 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
 pc = pc + geom_bar(width = 1, stat = "identity")
 pc = pc + coord_polar(theta="y")
-pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")"))
+pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("Classes", "( n =", sum(genesForPlot$Freq), ")"))

 png(filename="all.png")
 pc
@@ -319,8 +321,8 @@
 	pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
 	pc = pc + geom_bar(width = 1, stat = "identity")
 	pc = pc + coord_polar(theta="y")
-	pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")"))
-
+	pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA subclasses", "( n =", sum(genesForPlot$Freq), ")"))
+	write.table(genesForPlot, "ca.txt", sep="\t",quote=F,row.names=F,col.names=T)

 	png(filename="ca.png")
 	print(pc)
@@ -336,8 +338,8 @@
 	pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
 	pc = pc + geom_bar(width = 1, stat = "identity")
 	pc = pc + coord_polar(theta="y")
-	pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG", "( n =", sum(genesForPlot$Freq), ")"))
-
+	pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG subclasses", "( n =", sum(genesForPlot$Freq), ")"))
+	write.table(genesForPlot, "cg.txt", sep="\t",quote=F,row.names=F,col.names=T)

 	png(filename="cg.png")
 	print(pc)
@@ -346,9 +348,11 @@

 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)

-p = ggplot(dat, aes(best_match, percentage_mutations))# + scale_y_log10(breaks=scales,labels=scales)
+p = ggplot(dat, aes(best_match, percentage_mutations))
 p = p + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) + geom_point(aes(colour=best_match), position="jitter")
 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")
+write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
+

 png(filename="scatter.png")
 print(p)
--- a/wrapper.sh	Tue Apr 07 07:32:43 2015 -0400
+++ b/wrapper.sh	Wed Apr 08 05:25:52 2015 -0400
@@ -15,13 +15,10 @@
 cat $PWD/files/*/8_* > $PWD/mutationstats.txt
 cat $PWD/files/*/10_* > $PWD/hotspots.txt

-BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
+#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"

 echo "${BLASTN_DIR}"

-
-
-
 echo "identification ($method)"

 if [[ "${method}" == "custom" ]] ; then
@@ -50,6 +47,8 @@
 Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1
 echo "python mutation analysis"
 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --output $outdir/hotspot_analysis.txt
+echo "R AA histogram"
+Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1

 cat $outdir/mutations.txt $outdir/hotspot_analysis.txt > $outdir/result.txt

@@ -79,17 +78,26 @@


 echo "<img src='all.png'/><br />" >> $output
+echo "<a href='all.txt'>download data</a><br />" >> $output
 if [ -a $outdir/ca.png ]
 then
 	echo "<img src='ca.png'/><br />" >> $output
+	echo "<a href='ca.txt'>download data</a><br />" >> $output
 fi
 if [ -a $outdir/cg.png ]
 then
 	echo "<img src='cg.png'/><br />" >> $output
+	echo "<a href='cg.txt'>download data</a><br />" >> $output
 fi
 if [ -a $outdir/scatter.png ]
 then
 	echo "<img src='scatter.png'/><br />" >> $output
+	echo "<a href='scatter.txt'>download data</a><br />" >> $output
+fi
+if [ -a $outdir/aa_histogram.png ]
+then
+	echo "<img src='aa_histogram.png'/><br />" >> $output
+	echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
 fi

 for gene in ${genes[@]}