Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples

diff RScript.r @ 68:ef13f0a3f4d6 draft
Uploaded
author: davidvanzessen
date: Tue, 05 Jan 2016 10:49:40 -0500
parents: 40c72b9ffc79
children: c532b3f8dc97
--- a/RScript.r	Fri Nov 20 11:41:30 2015 -0500
+++ b/RScript.r	Tue Jan 05 10:49:40 2016 -0500
@@ -1,4 +1,5 @@
 args <- commandArgs(trailingOnly = TRUE)
+options(scipen=999)
 
 inFile = args[1]
 outDir = args[2]
@@ -67,6 +68,7 @@
 
 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
 patient.merge.list.second = list()
+  scatter_locus_data_list = list()
 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
@@ -125,10 +127,15 @@
   }
   
   scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
-  scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
-  scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
-  scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
-  scatterplot_data$on = onShort
+  #scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
+  scatterplot_data = patient1[NULL,scatterplot_data_columns]
+  #scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
+  #scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
+  scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
+  #scatterplot_data$type = factor(x=NULL, levels=scatterplot.data.type.factor)
+  scatterplot_data$type = character(0)
+  scatterplot_data$link = numeric(0)
+  scatterplot_data$on = character(0)
   
   #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
   patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
@@ -141,6 +148,7 @@
   if(patient %in% names(patient.merge.list)){
     patientMerge = patient.merge.list[[patient]]
     merge.list[["second"]] = patient.merge.list.second[[patient]]
+    scatterplot_data = scatter_locus_data_list[[patient]]
     cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
 
     print(names(patient.merge.list))
@@ -175,7 +183,9 @@
     merge.list = list()
 
     merge.list[["second"]] = vector()
-
+	
+	link.count = 1
+	
     while(nrow(patient.fuzzy) > 1){
       first.merge = patient.fuzzy[1,"merge"]
       first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
@@ -264,7 +274,24 @@
 
         tmp.rows = rbind(first.rows, second.rows)
         tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
-
+        
+        
+        #add to the scatterplot data
+        scatterplot.row = first.sum[,scatterplot_data_columns]
+		scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
+		scatterplot.row$link = link.count
+		scatterplot.row$on = onShort
+		
+		scatterplot_data = rbind(scatterplot_data, scatterplot.row)
+        
+        scatterplot.row = second.sum[,scatterplot_data_columns]
+		scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
+		scatterplot.row$link = link.count
+		scatterplot.row$on = onShort
+		
+		scatterplot_data = rbind(scatterplot_data, scatterplot.row)    
+		
+		#write some information about the match to a log file
         if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
           cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
         } else {
@@ -289,14 +316,32 @@
         merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
 
         patient.fuzzy = patient.fuzzy[-first.match.filter,]
+        
+        #add to the scatterplot data
+        scatterplot.row = first.sum[,scatterplot_data_columns]
+		scatterplot.row$type = first.sum[,"Sample"]
+		scatterplot.row$link = link.count
+		scatterplot.row$on = onShort
+		
+		scatterplot_data = rbind(scatterplot_data, scatterplot.row)
 
         cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
       } else {
         patient.fuzzy = patient.fuzzy[-1,]
+        
+        #add to the scatterplot data
+        scatterplot.row = first.sum[,scatterplot_data_columns]
+		scatterplot.row$type = first.sum[,"Sample"]
+		scatterplot.row$link = link.count
+		scatterplot.row$on = onShort
+		
+		scatterplot_data = rbind(scatterplot_data, scatterplot.row)
       }
+      link.count = link.count + 1    
     }
     patient.merge.list[[patient]] <<- patientMerge
     patient.merge.list.second[[patient]] <<- merge.list[["second"]]
+    scatter_locus_data_list[[patient]] <<- scatterplot_data
     cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
   }
 
@@ -305,6 +350,7 @@
 
   
   patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
+  #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
   res1 = vector()
   res2 = vector()
   resBoth = vector()
@@ -346,33 +392,42 @@
     } else {
       scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
       #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),]
-      scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
+      #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
       if(nrow(scatterplot_locus_data) > 0){
         scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
       }
-      in_one = (scatterplot_locus_data$merge %in% patient1$merge)
-      in_two = (scatterplot_locus_data$merge %in% patient2$merge)
-      if(any(in_two)){
-        scatterplot_locus_data[in_two,]$type = twoSample
-      }
-      in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
-      #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
-      #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
-      if(any(in_both)){
-        scatterplot_locus_data[in_both,]$type = "In Both"
-      }
-      if(type == "single"){
-        single_patients <<- rbind(single_patients, scatterplot_locus_data)
-      }
+      
+      
+      
+      #in_one = (scatterplot_locus_data$merge %in% patient1$merge)
+      #in_two = (scatterplot_locus_data$merge %in% patient2$merge)
+      #if(any(in_two)){
+      #  scatterplot_locus_data[in_two,]$type = twoSample
+      #}
+      #in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
+      ##merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
+      ##exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
+      #if(any(in_both)){
+      #  scatterplot_locus_data[in_both,]$type = "In Both"
+      #}
+      #if(type == "single" & (nrow(scatterplot_locus_data) > 0 | !any(scatterplot_locus_data$Patient %in% single_patients$Patient))){
+      #  single_patients <<- rbind(single_patients, scatterplot_locus_data)
+      #}
+      
+      
       p = NULL
+      print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
       if(nrow(scatterplot_locus_data) != 0){
         if(on == "normalized_read_count"){
           scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
-          p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
+          #p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
+          p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,10^6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
         } else {
-          p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
+          #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
+          p = ggplot(scatterplot_locus_data, aes(type, Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
         }
-        p = p + geom_point(aes(colour=type), position="jitter")
+        #p = p + geom_point(aes(colour=type), position="jitter")
+        p = p + geom_point(aes(colour=type), position="dodge")
         p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
       } else {
         p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
@@ -453,7 +508,7 @@
 
 if(nrow(single_patients) > 0){
 	scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
-	p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
+	p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
 	p = p + geom_point(aes(colour=type), position="jitter")
 	p = p + xlab("In one or both samples") + ylab("Reads")
 	p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
@@ -461,7 +516,8 @@
 	print(p)
 	dev.off()
 
-	p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
+	#p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
+	p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
 	p = p + geom_point(aes(colour=type), position="jitter")
 	p = p + xlab("In one or both samples") + ylab("Frequency")
 	p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
@@ -762,6 +818,11 @@
   patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
   patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
 
+  #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
+  #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
+  #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
+  #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
+
   patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
   patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
   patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
@@ -882,7 +943,8 @@
 					scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
           p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
         } else {
-          p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
+          p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
+          #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
         }
         p = p + geom_point(aes(colour=type), position="jitter")
         p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
@@ -1027,4 +1089,4 @@
 } else {
   cat("", file="triplets.txt")
 }
-cat("</table></html>", file=logfile, append=T)
\ No newline at end of file
+cat("</table></html>", file=logfile, append=T)
author	davidvanzessen
date	Tue, 05 Jan 2016 10:49:40 -0500
parents	40c72b9ffc79
children	c532b3f8dc97