Mercurial > repos > davidvanzessen > mutation_analysis
changeset 26:2433a1e110e1 draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 08 Apr 2015 05:25:52 -0400 |
parents | 58a62d2c0377 |
children | c9c95b96b7cc |
files | aa_histogram.r merge_and_filter.r mutation_analysis.py mutation_analysis.r wrapper.sh |
diffstat | 5 files changed, 66 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/aa_histogram.r Wed Apr 08 05:25:52 2015 -0400 @@ -0,0 +1,21 @@ +library(ggplot2) + +args <- commandArgs(trailingOnly = TRUE) + +input = args[1] +outfile = args[2] + +dat = read.table(input, header=F, sep=",") +dat=as.numeric(dat[1,]) +dat_sum = sum(dat) + +dat_freq = dat / dat_sum * 100 +dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq) + +m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +m = m + geom_histogram(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=1:length(dat_freq), labels=1:length(dat_freq)) +write.table(dat_dt, paste(dirname(outfile), "/aa_histogram.txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + +png(filename=outfile, width=1280, height=720) +print(m) +dev.off()
--- a/merge_and_filter.r Tue Apr 07 07:32:43 2015 -0400 +++ b/merge_and_filter.r Wed Apr 08 05:25:52 2015 -0400 @@ -32,6 +32,7 @@ higher_than=(summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70) unmatched = summ[!higher_than,] unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] +unmatched$best_match = paste("unmatched,", unmatched$best_match) summ = summ[higher_than,] if(length(summ$Sequence.ID) == 0){
--- a/mutation_analysis.py Tue Apr 07 07:32:43 2015 -0400 +++ b/mutation_analysis.py Wed Apr 08 05:25:52 2015 -0400 @@ -26,6 +26,9 @@ cdr2Index = 0 fr3Index = 0 first=True +IDlist = [] +mutationList = [] + with open(infile, 'r') as i: for line in i: if first: @@ -49,6 +52,23 @@ mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] + + mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + + IDlist += [ID] + + +AA_mutation = [0] * (int(max(mutationList, key=lambda i:int(i[4]) if i[4] else 0)[4]) + 1) + +for mutation in mutationList: + if mutation[4]: #if non silent mutation + AA_mutation[int(mutation[4])] += 1 + +print AA_mutation + +aa_mutations_file = outfile[:outfile.rindex("/")] + "/aa_mutations.txt" +with open(aa_mutations_file, 'w') as o: + o.write(",".join([str(x) for x in AA_mutation]) + "\n") if linecount == 0: print "No data, exiting" @@ -72,7 +92,6 @@ aggctatIndex = 0 atagcctIndex = 0 first = True -IDlist = [] with open(infile, 'r') as i: for line in i: if first: @@ -94,7 +113,7 @@ WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - IDlist += [ID] + directory = outfile[:outfile.rfind("/") + 1]
--- a/mutation_analysis.r Tue Apr 07 07:32:43 2015 -0400 +++ b/mutation_analysis.r Wed Apr 08 05:25:52 2015 -0400 @@ -148,7 +148,7 @@ setwd(outputdir) -nts = c("a", "t", "g", "c") +nts = c("a", "c", "g", "t") zeros=rep(0, 4) matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=7) for(i in 1:length(genes)){ @@ -298,11 +298,13 @@ genesForPlot = data.frame(table(genesForPlot)) colnames(genesForPlot) = c("Gene","Freq") genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) +write.table(genesForPlot, "all.txt", sep="\t",quote=F,row.names=F,col.names=T) + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) pc = pc + geom_bar(width = 1, stat = "identity") pc = pc + coord_polar(theta="y") -pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")")) +pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("Classes", "( n =", sum(genesForPlot$Freq), ")")) png(filename="all.png") pc @@ -319,8 +321,8 @@ pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) pc = pc + geom_bar(width = 1, stat = "identity") pc = pc + coord_polar(theta="y") - pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")")) - + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA subclasses", "( n =", sum(genesForPlot$Freq), ")")) + write.table(genesForPlot, "ca.txt", sep="\t",quote=F,row.names=F,col.names=T) png(filename="ca.png") print(pc) @@ -336,8 +338,8 @@ pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) pc = pc + geom_bar(width = 1, stat = "identity") pc = pc + coord_polar(theta="y") - pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG", "( n =", sum(genesForPlot$Freq), ")")) - + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG subclasses", "( n =", sum(genesForPlot$Freq), ")")) + write.table(genesForPlot, "cg.txt", sep="\t",quote=F,row.names=F,col.names=T) png(filename="cg.png") print(pc) @@ -346,9 +348,11 @@ dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) -p = ggplot(dat, aes(best_match, percentage_mutations))# + scale_y_log10(breaks=scales,labels=scales) +p = ggplot(dat, aes(best_match, percentage_mutations)) p = p + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) + geom_point(aes(colour=best_match), position="jitter") p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") +write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) + png(filename="scatter.png") print(p)
--- a/wrapper.sh Tue Apr 07 07:32:43 2015 -0400 +++ b/wrapper.sh Wed Apr 08 05:25:52 2015 -0400 @@ -15,13 +15,10 @@ cat $PWD/files/*/8_* > $PWD/mutationstats.txt cat $PWD/files/*/10_* > $PWD/hotspots.txt -BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin" +#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin" echo "${BLASTN_DIR}" - - - echo "identification ($method)" if [[ "${method}" == "custom" ]] ; then @@ -50,6 +47,8 @@ Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1 echo "python mutation analysis" python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --output $outdir/hotspot_analysis.txt +echo "R AA histogram" +Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1 cat $outdir/mutations.txt $outdir/hotspot_analysis.txt > $outdir/result.txt @@ -79,17 +78,26 @@ echo "<img src='all.png'/><br />" >> $output +echo "<a href='all.txt'>download data</a><br />" >> $output if [ -a $outdir/ca.png ] then echo "<img src='ca.png'/><br />" >> $output + echo "<a href='ca.txt'>download data</a><br />" >> $output fi if [ -a $outdir/cg.png ] then echo "<img src='cg.png'/><br />" >> $output + echo "<a href='cg.txt'>download data</a><br />" >> $output fi if [ -a $outdir/scatter.png ] then echo "<img src='scatter.png'/><br />" >> $output + echo "<a href='scatter.txt'>download data</a><br />" >> $output +fi +if [ -a $outdir/aa_histogram.png ] +then + echo "<img src='aa_histogram.png'/><br />" >> $output + echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output fi for gene in ${genes[@]}