# HG changeset patch # User davidvanzessen # Date 1471863623 14400 # Node ID 0453ea4d9f14dcff49372389406534308716eac3 # Parent 3d64b3efd352a89b8c83cf43145c27936d41df63 Uploaded diff -r 3d64b3efd352 -r 0453ea4d9f14 merge_and_filter.r --- a/merge_and_filter.r Wed Aug 17 08:29:07 2016 -0400 +++ b/merge_and_filter.r Mon Aug 22 07:00:23 2016 -0400 @@ -91,30 +91,30 @@ result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") -print(paste("Number of sequences after merging with mutation analysis:", nrow(result))) +print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") -print(paste("Number of sequences after merging with mutation stats:", nrow(result))) +print(paste("Number of sequences after merging with mutation stats file:", nrow(result))) result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID") -print(paste("Number of sequences after merging with hotspots:", nrow(result))) +print(paste("Number of sequences after merging with hotspots file:", nrow(result))) + +sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] +names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") +result = merge(result, sequences, by="Sequence.ID", all.x=T) + +print(paste("Number of sequences in result after merging with sequences:", nrow(result))) -#result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match) -if(unique.type == "AA.JUNCTION_V_subclass"){ - result$past = paste(result$AA.JUNCTION, result$VGene, result$best_match) -} else if (unique.type == "AA.JUNCTION_subclass"){ - result$past = paste(result$AA.JUNCTION, result$best_match) -} else if (unique.type == "V_subclass"){ - result$past = paste(result$VGene, result$best_match) -} else if (unique.type == "AA.JUNCTION_V"){ - result$past = paste(result$AA.JUNCTION, result$VGene) -} else if (unique.type == "AA.JUNCTION"){ - result$past = paste(result$AA.JUNCTION) -} else { - result$past = 1:nrow(result) -} +result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele) +result$VGene = gsub("[*].*", "", result$VGene) +result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele) +result$DGene = gsub("[*].*", "", result$DGene) +result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) +result$JGene = gsub("[*].*", "", result$JGene) + +result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) result = result[!(duplicated(result$past)), ] @@ -124,13 +124,6 @@ filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result))) -#remove the sequences that have an 'n' (or 'N') in the FR2, FR3, CDR1 and CDR2 regions. -sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] -names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") -result = merge(result, sequences, by="Sequence.ID", all.x=T) - -print(paste("Number of sequences in result after merging with sequences:", nrow(result))) - print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) @@ -167,11 +160,6 @@ result[is.na(result[,col]),] = 0 } -result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele) -result$VGene = gsub("[*].*", "", result$VGene) -result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) -result$JGene = gsub("[*].*", "", result$JGene) - write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) if(filter.unique != "no"){ diff -r 3d64b3efd352 -r 0453ea4d9f14 mutation_analysis.r --- a/mutation_analysis.r Wed Aug 17 08:29:07 2016 -0400 +++ b/mutation_analysis.r Mon Aug 22 07:00:23 2016 -0400 @@ -231,11 +231,11 @@ if(fname == "sum"){ matrx[10,x] = round(f(rowSums(tmp[,c("FR2.IMGT.Nb.of.nucleotides", "FR3.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1) matrx[10,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) - matrx[10,z] = round(matrx[10,x] / matrx[10,y], digits=1) + matrx[10,z] = round(matrx[10,x] / matrx[10,y] * 100, digits=1) matrx[11,x] = round(f(rowSums(tmp[,c("CDR1.IMGT.Nb.of.nucleotides", "CDR2.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1) matrx[11,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) - matrx[11,z] = round(matrx[11,x] / matrx[11,y], digits=1) + matrx[11,z] = round(matrx[11,x] / matrx[11,y] * 100, digits=1) } } @@ -338,9 +338,9 @@ result = data.frame(matrx) if(fname == "sum"){ - row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") + row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") } else { - row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") + row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") } write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) diff -r 3d64b3efd352 -r 0453ea4d9f14 mutation_analysis.xml --- a/mutation_analysis.xml Wed Aug 17 08:29:07 2016 -0400 +++ b/mutation_analysis.xml Mon Aug 22 07:00:23 2016 -0400 @@ -25,11 +25,16 @@ - - - - - + + + + + + + + + + diff -r 3d64b3efd352 -r 0453ea4d9f14 pattern_plots.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pattern_plots.r Mon Aug 22 07:00:23 2016 -0400 @@ -0,0 +1,17 @@ +library(ggplot2) +library(reshape2) + +args <- commandArgs(trailingOnly = TRUE) + +input.file = args[1] #the data that's get turned into the "SHM overview" table in the html report "data_sum.txt" +plot1.file = args[2] +plot2.file = args[3] +plot3.file = args[4] + +dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T) + +classes = c("ca", "ca1", "ca2", "cg", "cg1", "cg2", "cg3", "cg4", "cm") +xyz = c("x", "y", "z") + +names(dat) = c("info", paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep=".")) + diff -r 3d64b3efd352 -r 0453ea4d9f14 wrapper.sh --- a/wrapper.sh Wed Aug 17 08:29:07 2016 -0400 +++ b/wrapper.sh Mon Aug 22 07:00:23 2016 -0400 @@ -236,7 +236,7 @@ echo "---------------- $func table ----------------
" >> $log cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt - + echo "" >> $output echo "" >> $output for gene in ${genes[@]} @@ -360,6 +360,7 @@ echo "
info
" >> $output echo "" >> $output echo "" >> $output +echo "" >> $output echo "" >> $output echo "" >> $output echo "" >> $output
infolink
The complete datasetDownload
The SHM Overview table as a datasetDownload
The alignment info on the unmatched sequencesDownload
Motif data per sequence IDDownload
Mutation data per sequence IDDownload