Mercurial > repos > davidvanzessen > mutation_analysis
changeset 22:d84c9791d8c4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 07 Apr 2015 03:52:34 -0400 |
parents | c9f9623f1f76 |
children | 28b8d980db22 |
files | mutation_analysis.py mutation_analysis.r mutation_analysis.xml wrapper.sh |
diffstat | 4 files changed, 107 insertions(+), 127 deletions(-) [+] |
line wrap: on
line diff
--- a/mutation_analysis.py Thu Apr 02 03:31:23 2015 -0400 +++ b/mutation_analysis.py Tue Apr 07 03:52:34 2015 -0400 @@ -98,7 +98,7 @@ directory = outfile[:outfile.rfind("/") + 1] -value = 0; +value = 0 valuedic = dict() for gene in genes: with open(directory + gene + "_value.txt", 'r') as v: @@ -137,9 +137,4 @@ o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") first = False continue - print ID - print RGYWCount[ID] - print WRCYCount[ID] - print WACount[ID] - print TWCount[ID] o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n")
--- a/mutation_analysis.r Thu Apr 02 03:31:23 2015 -0400 +++ b/mutation_analysis.r Tue Apr 07 03:52:34 2015 -0400 @@ -3,6 +3,8 @@ input = args[1] genes = unlist(strsplit(args[2], ",")) outputdir = args[3] +print(args[4]) +include_fr1 = ifelse(args[4] == "yes", T, F) setwd(outputdir) dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) @@ -102,105 +104,38 @@ dat[is.na(dat[,col]),] = 0 } -dat$VRegionMutations = dat$CDR1.IMGT.Nb.of.mutations + - dat$FR2.IMGT.Nb.of.mutations + - dat$CDR2.IMGT.Nb.of.mutations + - dat$FR3.IMGT.Nb.of.mutations - -dat$VRegionNucleotides = dat$CDR1.IMGT.Nb.of.nucleotides + - dat$FR2.IMGT.Nb.of.nucleotides + - dat$CDR2.IMGT.Nb.of.nucleotides + - dat$FR3.IMGT.Nb.of.nucleotides +regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3") +if(!include_fr1){ + regions = c("CDR1", "FR2", "CDR2", "FR3") +} -dat$transitionMutations = dat$CDR1.IMGT.a.g + - dat$CDR1.IMGT.g.a + - dat$CDR1.IMGT.c.t + - dat$CDR1.IMGT.t.c + - dat$FR2.IMGT.a.g + - dat$FR2.IMGT.g.a + - dat$FR2.IMGT.c.t + - dat$FR2.IMGT.t.c + - dat$CDR2.IMGT.a.g + - dat$CDR2.IMGT.g.a + - dat$CDR2.IMGT.c.t + - dat$CDR2.IMGT.t.c + - dat$FR3.IMGT.a.g + - dat$FR3.IMGT.g.a + - dat$FR3.IMGT.c.t + - dat$FR3.IMGT.t.c +sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } -dat$transversionMutations = dat$CDR1.IMGT.a.c + - dat$CDR1.IMGT.c.a + - dat$CDR1.IMGT.a.t + - dat$CDR1.IMGT.t.a + - dat$CDR1.IMGT.g.c + - dat$CDR1.IMGT.c.g + - dat$CDR1.IMGT.g.t + - dat$CDR1.IMGT.t.g + - dat$FR2.IMGT.a.c + - dat$FR2.IMGT.c.a + - dat$FR2.IMGT.a.t + - dat$FR2.IMGT.t.a + - dat$FR2.IMGT.g.c + - dat$FR2.IMGT.c.g + - dat$FR2.IMGT.g.t + - dat$FR2.IMGT.t.g + - dat$CDR2.IMGT.a.c + - dat$CDR2.IMGT.c.a + - dat$CDR2.IMGT.a.t + - dat$CDR2.IMGT.t.a + - dat$CDR2.IMGT.g.c + - dat$CDR2.IMGT.c.g + - dat$CDR2.IMGT.g.t + - dat$CDR2.IMGT.t.g + - dat$FR3.IMGT.a.c + - dat$FR3.IMGT.c.a + - dat$FR3.IMGT.a.t + - dat$FR3.IMGT.t.a + - dat$FR3.IMGT.g.c + - dat$FR3.IMGT.c.g + - dat$FR3.IMGT.g.t + - dat$FR3.IMGT.t.g +VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="") +dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns) + +VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="") +dat$VRegionNucleotides = apply(dat, FUN=sum_by_row, 1, columns=VRegionNucleotides_columns) + +transitionMutations_columns = paste(rep(regions, each=4), c(".IMGT.a.g", ".IMGT.g.a", ".IMGT.c.t", ".IMGT.t.c"), sep="") +dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns) + +transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="") +dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns) -dat$transitionMutationsAtGC = dat$CDR1.IMGT.g.a + - dat$CDR1.IMGT.c.t + - dat$FR2.IMGT.g.a + - dat$FR2.IMGT.c.t + - dat$CDR2.IMGT.g.a + - dat$CDR2.IMGT.c.t + - dat$FR3.IMGT.g.a + - dat$FR3.IMGT.c.t +transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") +dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) -dat$totalMutationsAtGC = dat$CDR1.IMGT.g.a + - dat$CDR1.IMGT.c.t + - dat$CDR1.IMGT.c.a + - dat$CDR1.IMGT.g.c + - dat$CDR1.IMGT.c.g + - dat$CDR1.IMGT.g.t + - dat$FR2.IMGT.g.a + - dat$FR2.IMGT.c.t + - dat$FR2.IMGT.c.a + - dat$FR2.IMGT.g.c + - dat$FR2.IMGT.c.g + - dat$FR2.IMGT.g.t + - dat$CDR2.IMGT.g.a + - dat$CDR2.IMGT.c.t + - dat$CDR2.IMGT.c.a + - dat$CDR2.IMGT.g.c + - dat$CDR2.IMGT.c.g + - dat$CDR2.IMGT.g.t + - dat$FR3.IMGT.g.a + - dat$FR3.IMGT.c.t + - dat$FR3.IMGT.c.a + - dat$FR3.IMGT.g.c + - dat$FR3.IMGT.c.g + - dat$FR3.IMGT.g.t +totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t"), sep="") +dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) setwd(outputdir) +nts = c("a", "t", "g", "c") +zeros=rep(0, 4) matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=5) for(i in 1:length(genes)){ gene = genes[i] @@ -208,10 +143,6 @@ if(gene == "."){ tmp = dat } - if(length(tmp) == 0){ - cat("0", file=paste(gene, "_value.txt" ,sep="")) - next - } j = i - 1 x = (j * 3) + 1 y = (j * 3) + 2 @@ -232,33 +163,36 @@ matrx[5,y] = sum(tmp$VRegionMutations) matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) - transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) + transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) row.names(transitionTable) = c("A", "C", "G", "T") transitionTable["A","A"] = NA transitionTable["C","C"] = NA transitionTable["G","G"] = NA transitionTable["T","T"] = NA - nts = c("a", "c", "g", "t") + + if(nrow(tmp) > 0){ + for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + if(include_fr1){ + transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) + } else { + transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) + } + } + } + } - for(nt1 in nts){ - for(nt2 in nts){ - if(nt1 == nt2){ - next - } - NT1 = LETTERS[letters == nt1] - NT2 = LETTERS[letters == nt2] - FR1 = 0 #paste("FR1.IMGT.", nt1, ".", nt2, sep="") - CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") - FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") - CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") - FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") - transitionTable[NT1,NT2] = sum( tmp[,CDR1] + - tmp[,FR2] + - tmp[,CDR2] + - tmp[,FR3]) - } - } write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) @@ -294,7 +228,6 @@ transitionTable["C","C"] = NA transitionTable["G","G"] = NA transitionTable["T","T"] = NA -nts = c("a", "c", "g", "t") for(nt1 in nts){ @@ -309,10 +242,11 @@ FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") - transitionTable[NT1,NT2] = sum( tmp[,CDR1] + - tmp[,FR2] + - tmp[,CDR2] + - tmp[,FR3]) + if(include_fr1){ + transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) + } else { + transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) + } } } write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) @@ -382,3 +316,45 @@ print(pc) dev.off() } + +dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) + +p = ggplot(dat, aes(best_match, percentage_mutations))# + scale_y_log10(breaks=scales,labels=scales) +p = p + geom_point(aes(colour=best_match), position="jitter") +p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + +png(filename="scatter.png") +print(p) +dev.off() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
--- a/mutation_analysis.xml Thu Apr 02 03:31:23 2015 -0400 +++ b/mutation_analysis.xml Tue Apr 07 03:52:34 2015 -0400 @@ -1,7 +1,7 @@ <tool id="mutation_analysis_shm" name="Mutation Analysis" version="1.0"> <description></description> <command interpreter="bash"> - wrapper.sh $in_file $method $out_file $out_file.files_path ${in_file.name} + wrapper.sh $in_file $method $out_file $out_file.files_path ${in_file.name} ${include_fr1} </command> <inputs> <param name="in_file" type="data" label="IMGT zip file to be analysed" /> @@ -9,6 +9,10 @@ <option value="custom" selected="true">custom</option> <option value="blastn">blastn</option> </param> + <param name="include_fr1" type="select" label="Include mutations in FR1 region" help="" > + <option value="yes">yes</option> + <option value="no" selected="true">no</option> + </param> </inputs> <outputs> <data format="html" name="out_file" label = "Mutation analysis on ${in_file.name}"/>
--- a/wrapper.sh Thu Apr 02 03:31:23 2015 -0400 +++ b/wrapper.sh Tue Apr 07 03:52:34 2015 -0400 @@ -6,6 +6,7 @@ output=$3 outdir=$4 title=$5 +include_fr1=$6 mkdir $outdir unzip $input -d $PWD/files/ > $PWD/unziplog.log @@ -14,7 +15,7 @@ cat $PWD/files/*/8_* > $PWD/mutationstats.txt cat $PWD/files/*/10_* > $PWD/hotspots.txt -#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin" +BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin" echo "${BLASTN_DIR}" @@ -46,7 +47,7 @@ genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm" echo "R mutation analysis" -Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir 2>&1 +Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1 echo "python mutation analysis" python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --output $outdir/hotspot_analysis.txt @@ -82,6 +83,10 @@ then echo "<img src='cg.png'/><br />" >> $output fi +if [ -a $outdir/scatter.png ] +then + echo "<img src='scatter.png'/><br />" >> $output +fi for gene in ${genes[@]} do