# HG changeset patch
# User davidvanzessen
# Date 1428485152 14400
# Node ID 2433a1e110e1415a24f6cbe678d0eef78fcaab7d
# Parent 58a62d2c03774adf051e977a28e6c98ec8c75149
Uploaded
diff -r 58a62d2c0377 -r 2433a1e110e1 aa_histogram.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aa_histogram.r Wed Apr 08 05:25:52 2015 -0400
@@ -0,0 +1,21 @@
+library(ggplot2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+input = args[1]
+outfile = args[2]
+
+dat = read.table(input, header=F, sep=",")
+dat=as.numeric(dat[1,])
+dat_sum = sum(dat)
+
+dat_freq = dat / dat_sum * 100
+dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+
+m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+m = m + geom_histogram(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=1:length(dat_freq), labels=1:length(dat_freq))
+write.table(dat_dt, paste(dirname(outfile), "/aa_histogram.txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+
+png(filename=outfile, width=1280, height=720)
+print(m)
+dev.off()
diff -r 58a62d2c0377 -r 2433a1e110e1 merge_and_filter.r
--- a/merge_and_filter.r Tue Apr 07 07:32:43 2015 -0400
+++ b/merge_and_filter.r Wed Apr 08 05:25:52 2015 -0400
@@ -32,6 +32,7 @@
higher_than=(summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70)
unmatched = summ[!higher_than,]
unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
+unmatched$best_match = paste("unmatched,", unmatched$best_match)
summ = summ[higher_than,]
if(length(summ$Sequence.ID) == 0){
diff -r 58a62d2c0377 -r 2433a1e110e1 mutation_analysis.py
--- a/mutation_analysis.py Tue Apr 07 07:32:43 2015 -0400
+++ b/mutation_analysis.py Wed Apr 08 05:25:52 2015 -0400
@@ -26,6 +26,9 @@
cdr2Index = 0
fr3Index = 0
first=True
+IDlist = []
+mutationList = []
+
with open(infile, 'r') as i:
for line in i:
if first:
@@ -49,6 +52,23 @@
mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
+
+ mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+
+ IDlist += [ID]
+
+
+AA_mutation = [0] * (int(max(mutationList, key=lambda i:int(i[4]) if i[4] else 0)[4]) + 1)
+
+for mutation in mutationList:
+ if mutation[4]: #if non silent mutation
+ AA_mutation[int(mutation[4])] += 1
+
+print AA_mutation
+
+aa_mutations_file = outfile[:outfile.rindex("/")] + "/aa_mutations.txt"
+with open(aa_mutations_file, 'w') as o:
+ o.write(",".join([str(x) for x in AA_mutation]) + "\n")
if linecount == 0:
print "No data, exiting"
@@ -72,7 +92,6 @@
aggctatIndex = 0
atagcctIndex = 0
first = True
-IDlist = []
with open(infile, 'r') as i:
for line in i:
if first:
@@ -94,7 +113,7 @@
WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
- IDlist += [ID]
+
directory = outfile[:outfile.rfind("/") + 1]
diff -r 58a62d2c0377 -r 2433a1e110e1 mutation_analysis.r
--- a/mutation_analysis.r Tue Apr 07 07:32:43 2015 -0400
+++ b/mutation_analysis.r Wed Apr 08 05:25:52 2015 -0400
@@ -148,7 +148,7 @@
setwd(outputdir)
-nts = c("a", "t", "g", "c")
+nts = c("a", "c", "g", "t")
zeros=rep(0, 4)
matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=7)
for(i in 1:length(genes)){
@@ -298,11 +298,13 @@
genesForPlot = data.frame(table(genesForPlot))
colnames(genesForPlot) = c("Gene","Freq")
genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
+write.table(genesForPlot, "all.txt", sep="\t",quote=F,row.names=F,col.names=T)
+
pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
pc = pc + geom_bar(width = 1, stat = "identity")
pc = pc + coord_polar(theta="y")
-pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")"))
+pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("Classes", "( n =", sum(genesForPlot$Freq), ")"))
png(filename="all.png")
pc
@@ -319,8 +321,8 @@
pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
pc = pc + geom_bar(width = 1, stat = "identity")
pc = pc + coord_polar(theta="y")
- pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")"))
-
+ pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA subclasses", "( n =", sum(genesForPlot$Freq), ")"))
+ write.table(genesForPlot, "ca.txt", sep="\t",quote=F,row.names=F,col.names=T)
png(filename="ca.png")
print(pc)
@@ -336,8 +338,8 @@
pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
pc = pc + geom_bar(width = 1, stat = "identity")
pc = pc + coord_polar(theta="y")
- pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG", "( n =", sum(genesForPlot$Freq), ")"))
-
+ pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG subclasses", "( n =", sum(genesForPlot$Freq), ")"))
+ write.table(genesForPlot, "cg.txt", sep="\t",quote=F,row.names=F,col.names=T)
png(filename="cg.png")
print(pc)
@@ -346,9 +348,11 @@
dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)
-p = ggplot(dat, aes(best_match, percentage_mutations))# + scale_y_log10(breaks=scales,labels=scales)
+p = ggplot(dat, aes(best_match, percentage_mutations))
p = p + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) + geom_point(aes(colour=best_match), position="jitter")
p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")
+write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
+
png(filename="scatter.png")
print(p)
diff -r 58a62d2c0377 -r 2433a1e110e1 wrapper.sh
--- a/wrapper.sh Tue Apr 07 07:32:43 2015 -0400
+++ b/wrapper.sh Wed Apr 08 05:25:52 2015 -0400
@@ -15,13 +15,10 @@
cat $PWD/files/*/8_* > $PWD/mutationstats.txt
cat $PWD/files/*/10_* > $PWD/hotspots.txt
-BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
+#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
echo "${BLASTN_DIR}"
-
-
-
echo "identification ($method)"
if [[ "${method}" == "custom" ]] ; then
@@ -50,6 +47,8 @@
Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1
echo "python mutation analysis"
python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --output $outdir/hotspot_analysis.txt
+echo "R AA histogram"
+Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
cat $outdir/mutations.txt $outdir/hotspot_analysis.txt > $outdir/result.txt
@@ -79,17 +78,26 @@
echo "
" >> $output
+echo "download data
" >> $output
if [ -a $outdir/ca.png ]
then
echo "
" >> $output
+ echo "download data
" >> $output
fi
if [ -a $outdir/cg.png ]
then
echo "
" >> $output
+ echo "download data
" >> $output
fi
if [ -a $outdir/scatter.png ]
then
echo "
" >> $output
+ echo "download data
" >> $output
+fi
+if [ -a $outdir/aa_histogram.png ]
+then
+ echo "
" >> $output
+ echo "download data
" >> $output
fi
for gene in ${genes[@]}