Mercurial > repos > davidvanzessen > mutation_analysis
changeset 123:0453ea4d9f14 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 22 Aug 2016 07:00:23 -0400 |
parents | 3d64b3efd352 |
children | 4a93146f87aa |
files | merge_and_filter.r mutation_analysis.r mutation_analysis.xml pattern_plots.r wrapper.sh |
diffstat | 5 files changed, 50 insertions(+), 39 deletions(-) [+] |
line wrap: on
line diff
--- a/merge_and_filter.r Wed Aug 17 08:29:07 2016 -0400 +++ b/merge_and_filter.r Mon Aug 22 07:00:23 2016 -0400 @@ -91,30 +91,30 @@ result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") -print(paste("Number of sequences after merging with mutation analysis:", nrow(result))) +print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") -print(paste("Number of sequences after merging with mutation stats:", nrow(result))) +print(paste("Number of sequences after merging with mutation stats file:", nrow(result))) result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID") -print(paste("Number of sequences after merging with hotspots:", nrow(result))) +print(paste("Number of sequences after merging with hotspots file:", nrow(result))) + +sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] +names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") +result = merge(result, sequences, by="Sequence.ID", all.x=T) + +print(paste("Number of sequences in result after merging with sequences:", nrow(result))) -#result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match) -if(unique.type == "AA.JUNCTION_V_subclass"){ - result$past = paste(result$AA.JUNCTION, result$VGene, result$best_match) -} else if (unique.type == "AA.JUNCTION_subclass"){ - result$past = paste(result$AA.JUNCTION, result$best_match) -} else if (unique.type == "V_subclass"){ - result$past = paste(result$VGene, result$best_match) -} else if (unique.type == "AA.JUNCTION_V"){ - result$past = paste(result$AA.JUNCTION, result$VGene) -} else if (unique.type == "AA.JUNCTION"){ - result$past = paste(result$AA.JUNCTION) -} else { - result$past = 1:nrow(result) -} +result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele) +result$VGene = gsub("[*].*", "", result$VGene) +result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele) +result$DGene = gsub("[*].*", "", result$DGene) +result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) +result$JGene = gsub("[*].*", "", result$JGene) + +result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) result = result[!(duplicated(result$past)), ] @@ -124,13 +124,6 @@ filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result))) -#remove the sequences that have an 'n' (or 'N') in the FR2, FR3, CDR1 and CDR2 regions. -sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] -names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") -result = merge(result, sequences, by="Sequence.ID", all.x=T) - -print(paste("Number of sequences in result after merging with sequences:", nrow(result))) - print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) @@ -167,11 +160,6 @@ result[is.na(result[,col]),] = 0 } -result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele) -result$VGene = gsub("[*].*", "", result$VGene) -result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) -result$JGene = gsub("[*].*", "", result$JGene) - write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) if(filter.unique != "no"){
--- a/mutation_analysis.r Wed Aug 17 08:29:07 2016 -0400 +++ b/mutation_analysis.r Mon Aug 22 07:00:23 2016 -0400 @@ -231,11 +231,11 @@ if(fname == "sum"){ matrx[10,x] = round(f(rowSums(tmp[,c("FR2.IMGT.Nb.of.nucleotides", "FR3.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1) matrx[10,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) - matrx[10,z] = round(matrx[10,x] / matrx[10,y], digits=1) + matrx[10,z] = round(matrx[10,x] / matrx[10,y] * 100, digits=1) matrx[11,x] = round(f(rowSums(tmp[,c("CDR1.IMGT.Nb.of.nucleotides", "CDR2.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1) matrx[11,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) - matrx[11,z] = round(matrx[11,x] / matrx[11,y], digits=1) + matrx[11,z] = round(matrx[11,x] / matrx[11,y] * 100, digits=1) } } @@ -338,9 +338,9 @@ result = data.frame(matrx) if(fname == "sum"){ - row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") + row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") } else { - row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") + row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") } write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F)
--- a/mutation_analysis.xml Wed Aug 17 08:29:07 2016 -0400 +++ b/mutation_analysis.xml Mon Aug 22 07:00:23 2016 -0400 @@ -25,11 +25,16 @@ <option value="no" selected="true">No</option> </param> <param name="unique" type="select" label="Remove duplicates based on" help="" > - <option value="AA.JUNCTION_V_subclass" selected="true">Top.V.Gene, CDR3.Seq, C region</option> - <option value="AA.JUNCTION_subclass">CDR3.Seq, C region</option> - <option value="AA.JUNCTION_V">CDR3.seq, Top.V.Gene</option> - <option value="AA.JUNCTION">CDR3.seq</option> - <option value="none">Don't remove duplicates</option> + <option value="VGene,AA.JUNCTION,best_match" selected="true">Top.V.Gene, CDR3.AA.Seq, C region</option> + <option value="VGene,AA.JUNCTION">Top.V.Gene, CDR3.AA.Seq</option> + <option value="AA.JUNCTION,best_match">CDR3.AA.Seq, C region</option> + <option value="AA.JUNCTION">CDR3.AA.Seq</option> + + <option value="VGene,CDR3.IMGT.seq,best_match" selected="true">Top.V.Gene, CDR3.nt.Seq, C region</option> + <option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3.nt.Seq</option> + <option value="CDR3.IMGT.seq,best_match">CDR3.nt.Seq, C region</option> + <option value="CDR3.IMGT.seq">CDR3.nt.Seq</option> + <option value="Sequence.ID">Don't remove duplicates</option> </param> <param name="class_filter" type="select" label="Class/Subclass filter" help="" > <option value="70_70" selected="true">>70% class and >70% subclass</option>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pattern_plots.r Mon Aug 22 07:00:23 2016 -0400 @@ -0,0 +1,17 @@ +library(ggplot2) +library(reshape2) + +args <- commandArgs(trailingOnly = TRUE) + +input.file = args[1] #the data that's get turned into the "SHM overview" table in the html report "data_sum.txt" +plot1.file = args[2] +plot2.file = args[3] +plot3.file = args[4] + +dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T) + +classes = c("ca", "ca1", "ca2", "cg", "cg1", "cg2", "cg3", "cg4", "cm") +xyz = c("x", "y", "z") + +names(dat) = c("info", paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep=".")) +
--- a/wrapper.sh Wed Aug 17 08:29:07 2016 -0400 +++ b/wrapper.sh Mon Aug 22 07:00:23 2016 -0400 @@ -236,7 +236,7 @@ echo "---------------- $func table ----------------<br />" >> $log cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt - + echo "<table class='pure-table pure-table-striped'>" >> $output echo "<thead><tr><th>info</th>" >> $output for gene in ${genes[@]} @@ -360,6 +360,7 @@ echo "<table class='pure-table pure-table-striped'>" >> $output echo "<thead><tr><th>info</th><th>link</th></tr></thead>" >> $output echo "<tr><td>The complete dataset</td><td><a href='merged.txt'>Download</a></td></tr>" >> $output +echo "<tr><td>The SHM Overview table as a dataset</td><td><a href='data_sum.txt'>Download</a></td></tr>" >> $output echo "<tr><td>The alignment info on the unmatched sequences</td><td><a href='unmatched.txt'>Download</a></td></tr>" >> $output echo "<tr><td>Motif data per sequence ID</td><td><a href='motif_per_seq.txt'>Download</a></td></tr>" >> $output echo "<tr><td>Mutation data per sequence ID</td><td><a href='mutation_by_id.txt'>Download</a></td></tr>" >> $output