changeset 123:0453ea4d9f14 draft

Uploaded
author davidvanzessen
date Mon, 22 Aug 2016 07:00:23 -0400
parents 3d64b3efd352
children 4a93146f87aa
files merge_and_filter.r mutation_analysis.r mutation_analysis.xml pattern_plots.r wrapper.sh
diffstat 5 files changed, 50 insertions(+), 39 deletions(-) [+]
line wrap: on
line diff
--- a/merge_and_filter.r	Wed Aug 17 08:29:07 2016 -0400
+++ b/merge_and_filter.r	Mon Aug 22 07:00:23 2016 -0400
@@ -91,30 +91,30 @@
 
 result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID")
 
-print(paste("Number of sequences after merging with mutation analysis:", nrow(result)))
+print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))
 
 result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID")
 
-print(paste("Number of sequences after merging with mutation stats:", nrow(result)))
+print(paste("Number of sequences after merging with mutation stats file:", nrow(result)))
 
 result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID")
 
-print(paste("Number of sequences after merging with hotspots:", nrow(result)))
+print(paste("Number of sequences after merging with hotspots file:", nrow(result)))
+
+sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")]
+names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
+result = merge(result, sequences, by="Sequence.ID", all.x=T)
+
+print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
 
-#result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match)
-if(unique.type == "AA.JUNCTION_V_subclass"){
-	result$past = paste(result$AA.JUNCTION, result$VGene, result$best_match)
-} else if (unique.type == "AA.JUNCTION_subclass"){
-	result$past = paste(result$AA.JUNCTION, result$best_match)
-} else if (unique.type == "V_subclass"){
-	result$past = paste(result$VGene, result$best_match)
-} else if (unique.type == "AA.JUNCTION_V"){
-	result$past = paste(result$AA.JUNCTION, result$VGene)
-} else if (unique.type == "AA.JUNCTION"){
-	result$past = paste(result$AA.JUNCTION)
-} else {
-	result$past = 1:nrow(result)
-}
+result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele)
+result$VGene = gsub("[*].*", "", result$VGene)
+result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele)
+result$DGene = gsub("[*].*", "", result$DGene)
+result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
+result$JGene = gsub("[*].*", "", result$JGene)
+
+result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
 
 result = result[!(duplicated(result$past)), ]
 
@@ -124,13 +124,6 @@
 
 filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result)))
 
-#remove the sequences that have an 'n' (or 'N') in the FR2, FR3, CDR1 and CDR2 regions.
-sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")]
-names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
-result = merge(result, sequences, by="Sequence.ID", all.x=T)
-
-print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
-
 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
@@ -167,11 +160,6 @@
   result[is.na(result[,col]),] = 0
 }
 
-result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele)
-result$VGene = gsub("[*].*", "", result$VGene)
-result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
-result$JGene = gsub("[*].*", "", result$JGene)
-
 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
 
 if(filter.unique != "no"){
--- a/mutation_analysis.r	Wed Aug 17 08:29:07 2016 -0400
+++ b/mutation_analysis.r	Mon Aug 22 07:00:23 2016 -0400
@@ -231,11 +231,11 @@
 		if(fname == "sum"){
 			matrx[10,x] = round(f(rowSums(tmp[,c("FR2.IMGT.Nb.of.nucleotides", "FR3.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1)
 			matrx[10,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1)
-			matrx[10,z] = round(matrx[10,x] / matrx[10,y], digits=1)
+			matrx[10,z] = round(matrx[10,x] / matrx[10,y] * 100, digits=1)
 
 			matrx[11,x] = round(f(rowSums(tmp[,c("CDR1.IMGT.Nb.of.nucleotides", "CDR2.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1)
 			matrx[11,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1)
-			matrx[11,z] = round(matrx[11,x] / matrx[11,y], digits=1)
+			matrx[11,z] = round(matrx[11,x] / matrx[11,y] * 100, digits=1)
 		}
 	}
   
@@ -338,9 +338,9 @@
 	
 	result = data.frame(matrx)
 	if(fname == "sum"){
-		row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR")
+		row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR")
 	} else {
-		row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)")
+		row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)")
 	}
 
 	write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F)
--- a/mutation_analysis.xml	Wed Aug 17 08:29:07 2016 -0400
+++ b/mutation_analysis.xml	Mon Aug 22 07:00:23 2016 -0400
@@ -25,11 +25,16 @@
 			<option value="no" selected="true">No</option>
 		</param>
 		<param name="unique" type="select" label="Remove duplicates based on" help="" >
-			<option value="AA.JUNCTION_V_subclass" selected="true">Top.V.Gene, CDR3.Seq, C region</option>
-			<option value="AA.JUNCTION_subclass">CDR3.Seq, C region</option>
-			<option value="AA.JUNCTION_V">CDR3.seq, Top.V.Gene</option>
-			<option value="AA.JUNCTION">CDR3.seq</option>
-			<option value="none">Don't remove duplicates</option>
+			<option value="VGene,AA.JUNCTION,best_match" selected="true">Top.V.Gene, CDR3.AA.Seq, C region</option>			
+			<option value="VGene,AA.JUNCTION">Top.V.Gene, CDR3.AA.Seq</option>
+			<option value="AA.JUNCTION,best_match">CDR3.AA.Seq, C region</option>
+			<option value="AA.JUNCTION">CDR3.AA.Seq</option>
+			
+			<option value="VGene,CDR3.IMGT.seq,best_match" selected="true">Top.V.Gene, CDR3.nt.Seq, C region</option>			
+			<option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3.nt.Seq</option>
+			<option value="CDR3.IMGT.seq,best_match">CDR3.nt.Seq, C region</option>
+			<option value="CDR3.IMGT.seq">CDR3.nt.Seq</option>
+			<option value="Sequence.ID">Don't remove duplicates</option>
 		</param>
 		<param name="class_filter" type="select" label="Class/Subclass filter" help="" >
 			<option value="70_70" selected="true">>70% class and >70% subclass</option>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pattern_plots.r	Mon Aug 22 07:00:23 2016 -0400
@@ -0,0 +1,17 @@
+library(ggplot2)
+library(reshape2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+input.file = args[1] #the data that's get turned into the "SHM overview" table in the html report "data_sum.txt"
+plot1.file = args[2] 
+plot2.file = args[3]
+plot3.file = args[4]
+
+dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T)
+
+classes = c("ca", "ca1", "ca2", "cg", "cg1", "cg2", "cg3", "cg4", "cm")
+xyz = c("x", "y", "z")
+
+names(dat) = c("info", paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep="."))
+
--- a/wrapper.sh	Wed Aug 17 08:29:07 2016 -0400
+++ b/wrapper.sh	Mon Aug 22 07:00:23 2016 -0400
@@ -236,7 +236,7 @@
 	echo "---------------- $func table ----------------<br />" >> $log
 	
 	cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
-
+	
 	echo "<table class='pure-table pure-table-striped'>" >> $output
 	echo "<thead><tr><th>info</th>" >> $output
 	for gene in ${genes[@]}
@@ -360,6 +360,7 @@
 echo "<table class='pure-table pure-table-striped'>" >> $output
 echo "<thead><tr><th>info</th><th>link</th></tr></thead>" >> $output
 echo "<tr><td>The complete dataset</td><td><a href='merged.txt'>Download</a></td></tr>" >> $output
+echo "<tr><td>The SHM Overview table as a dataset</td><td><a href='data_sum.txt'>Download</a></td></tr>" >> $output
 echo "<tr><td>The alignment info on the unmatched sequences</td><td><a href='unmatched.txt'>Download</a></td></tr>" >> $output
 echo "<tr><td>Motif data per sequence ID</td><td><a href='motif_per_seq.txt'>Download</a></td></tr>" >> $output
 echo "<tr><td>Mutation data per sequence ID</td><td><a href='mutation_by_id.txt'>Download</a></td></tr>" >> $output