# HG changeset patch
# User davidvanzessen
# Date 1468411453 14400
# Node ID 01c9993865af6bb421270e745d3df07aa0564ee0
# Parent 074ae1e30e8f74308fe20287b9d12898a3b1771a
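Uploaded: build per-gene AA mutation histograms in a single aa_histogram.r run

* aa_histogram.r: take the per-ID mutation table, the per-ID absent-AA table, a comma-separated gene list and an output directory; loop over each gene (plus "" for all rows whose best_match is not "unmatched") and write aa_histogram_<gene>.txt and aa_histogram_<gene>.png.
* mutation_analysis.py: write a header row to aa_id_mutations.txt and add a best_match column to both aa_id_mutations.txt and absent_aa_id.txt.
* wrapper.sh: replace the four separate Rscript calls with one call for "ca,cg,cm", copy aa_mutations.txt to aa_mutations_count.txt, rename aa_histogram_.png to aa_histogram.png, and append the final "Done!" line to $outdir/log.html instead of $log.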
diff -r 074ae1e30e8f -r 01c9993865af aa_histogram.r
--- a/aa_histogram.r Wed Jun 29 05:13:25 2016 -0400
+++ b/aa_histogram.r Wed Jul 13 08:04:13 2016 -0400
@@ -2,38 +2,52 @@
args <- commandArgs(trailingOnly = TRUE)
-input = args[1]
-outfile = args[2]
-gene = args[3]
+mutations.by.id.file = args[1]
+absent.aa.by.id.file = args[2]
+genes = strsplit(args[3], ",")[[1]]
+genes = c(genes, "")
+outdir = args[4]
+
print("---------------- read input ----------------")
-dat = read.table(input, sep="\t", fill=T, header=T, quote="")
-
-print("---------------- as numeric ----------------")
-
-mutations.at.position = as.numeric(dat[1,])
-aa.at.position = as.numeric(dat[2,])
+mutations.by.id = read.table(mutations.by.id.file, sep="\t", fill=T, header=T, quote="")
+absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
-print("---------------- freq data.frame ----------------")
+for(gene in genes){
+
+ if(gene == ""){
+ mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
+ absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
+ } else {
+ mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
+ absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
+ }
+ if(nrow(mutations.by.id.gene) == 0){
+ next
+ }
-dat_freq = mutations.at.position / aa.at.position
-dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
-
-print("---------------- plot ----------------")
+ mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
+ aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
-m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
-m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=1:length(dat_freq), labels=1:length(dat_freq))
-m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
-m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
-m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
-m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
-m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
-m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency"))
+ dat_freq = mutations.at.position / aa.at.position
+ dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+
+ print("---------------- plot ----------------")
-print("---------------- write/print ----------------")
+ m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+ m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=1:length(dat_freq), labels=1:length(dat_freq))
+ m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
+ m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
+ m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
+ m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
+ m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
+ m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency"))
-write.table(dat_dt, paste(dirname(outfile), "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-png(filename=outfile, width=1280, height=720)
-print(m)
-dev.off()
+ print("---------------- write/print ----------------")
+
+ write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+ png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
+ print(m)
+ dev.off()
+}
diff -r 074ae1e30e8f -r 01c9993865af mutation_analysis.py
--- a/mutation_analysis.py Wed Jun 29 05:13:25 2016 -0400
+++ b/mutation_analysis.py Wed Jul 13 08:04:13 2016 -0400
@@ -86,6 +86,7 @@
aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"
with open(aa_mutations_by_id_file, 'w') as o:
+ o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
for ID in mutationListByID.keys():
AA_mutation_for_ID = AA_mutation_empty[:]
for mutation in mutationListByID[ID]:
@@ -95,7 +96,7 @@
AA_mutation_for_ID[AA_mutation_position] += 1
clss = genedic[ID][:2]
AA_mutation_dic[clss][AA_mutation_position] += 1
- o.write(ID + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n")
+ o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n")
@@ -133,7 +134,7 @@
aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt"
with open(aa_mutations_by_id_file, 'w') as o:
- o.write("ID\tcdr1length\tcdr2length\t" + "\t".join([str(x) for x in range(1,AALength-1)]) + "\n")
+ o.write("ID\tcdr1length\tcdr2length\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
for ID in IDlist:
absentAAbyID = [1] * (AALength-1)
cdr1Length = cdr1LengthDic[ID]
@@ -143,7 +144,7 @@
cdr2Length = cdr2LengthDic[ID]
for c in absentAACDR2Dic[cdr2Length]:
absentAAbyID[c] -= 1
- o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")
+ o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")
diff -r 074ae1e30e8f -r 01c9993865af wrapper.sh
--- a/wrapper.sh Wed Jun 29 05:13:25 2016 -0400
+++ b/wrapper.sh Wed Jul 13 08:04:13 2016 -0400
@@ -148,22 +148,9 @@
echo "---------------- aa_histogram.r ----------------"
echo "---------------- aa_histogram.r ----------------
" >> $log
-Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png "" 2>&1
-
-echo "---------------- aa_histogram.r ca ----------------"
-echo "---------------- aa_histogram.r ca ----------------
" >> $log
-
-Rscript $dir/aa_histogram.r $outdir/aa_mutations_ca.txt $outdir/aa_histogram_ca.png "ca" 2>&1
-
-echo "---------------- aa_histogram.r cg ----------------"
-echo "---------------- aa_histogram.r cg ----------------
" >> $log
-
-Rscript $dir/aa_histogram.r $outdir/aa_mutations_cg.txt $outdir/aa_histogram_cg.png "cg" 2>&1
-
-echo "---------------- aa_histogram.r cm ----------------"
-echo "---------------- aa_histogram.r cm ----------------
" >> $log
-
-Rscript $dir/aa_histogram.r $outdir/aa_mutations_cm.txt $outdir/aa_histogram_cm.png "cm" 2>&1
+cp $outdir/aa_mutations.txt $outdir/aa_mutations_count.txt
+Rscript $dir/aa_histogram.r $outdir/aa_id_mutations.txt $outdir/absent_aa_id.txt "ca,cg,cm" $outdir/ 2>&1
+mv $outdir/aa_histogram_.png $outdir/aa_histogram.png
genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
@@ -384,5 +371,5 @@
cp $outdir/index.html $log
echo "---------------- Done! ----------------"
-echo "---------------- Done! ----------------
" >> $log
+echo "---------------- Done! ----------------
" >> $outdir/log.html