Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
diff RScript.r @ 68:ef13f0a3f4d6 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 05 Jan 2016 10:49:40 -0500 |
parents | 40c72b9ffc79 |
children | c532b3f8dc97 |
line wrap: on
line diff
--- a/RScript.r Fri Nov 20 11:41:30 2015 -0500 +++ b/RScript.r Tue Jan 05 10:49:40 2016 -0500 @@ -1,4 +1,5 @@ args <- commandArgs(trailingOnly = TRUE) +options(scipen=999) inFile = args[1] outDir = args[2] @@ -67,6 +68,7 @@ patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory... patient.merge.list.second = list() + scatter_locus_data_list = list() cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T) cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T) patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ @@ -125,10 +127,15 @@ } scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge") - scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns]) - scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] - scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both")) - scatterplot_data$on = onShort + #scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns]) + scatterplot_data = patient1[NULL,scatterplot_data_columns] + #scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] + #scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both")) + scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")) + #scatterplot_data$type = factor(x=NULL, levels=scatterplot.data.type.factor) + scatterplot_data$type = character(0) + scatterplot_data$link = numeric(0) + scatterplot_data$on = character(0) #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy' patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh @@ -141,6 +148,7 @@ if(patient %in% names(patient.merge.list)){ patientMerge = patient.merge.list[[patient]] merge.list[["second"]] = patient.merge.list.second[[patient]] + scatterplot_data = scatter_locus_data_list[[patient]] cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T) print(names(patient.merge.list)) @@ -175,7 +183,9 @@ merge.list = list() merge.list[["second"]] = vector() - + + link.count = 1 + while(nrow(patient.fuzzy) > 1){ first.merge = patient.fuzzy[1,"merge"] first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"] @@ -264,7 +274,24 @@ tmp.rows = rbind(first.rows, second.rows) tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),] - + + + #add to the scatterplot data + scatterplot.row = first.sum[,scatterplot_data_columns] + scatterplot.row$type = paste(first.sum[,"Sample"], "In Both") + scatterplot.row$link = link.count + scatterplot.row$on = onShort + + scatterplot_data = rbind(scatterplot_data, scatterplot.row) + + scatterplot.row = second.sum[,scatterplot_data_columns] + scatterplot.row$type = paste(second.sum[,"Sample"], "In Both") + scatterplot.row$link = link.count + scatterplot.row$on = onShort + + scatterplot_data = rbind(scatterplot_data, scatterplot.row) + + #write some information about the match to a log file if (nrow(first.rows) > 1 | nrow(second.rows) > 1) { cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T) } else { @@ -289,14 +316,32 @@ merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) patient.fuzzy = patient.fuzzy[-first.match.filter,] + + #add to the scatterplot data + scatterplot.row = first.sum[,scatterplot_data_columns] + scatterplot.row$type = first.sum[,"Sample"] + scatterplot.row$link = link.count + scatterplot.row$on = onShort + + scatterplot_data = rbind(scatterplot_data, scatterplot.row) cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T) } else { patient.fuzzy = patient.fuzzy[-1,] + + #add to the scatterplot data + scatterplot.row = first.sum[,scatterplot_data_columns] + scatterplot.row$type = first.sum[,"Sample"] + scatterplot.row$link = link.count + scatterplot.row$on = onShort + + scatterplot_data = rbind(scatterplot_data, scatterplot.row) } + link.count = link.count + 1 } patient.merge.list[[patient]] <<- patientMerge patient.merge.list.second[[patient]] <<- merge.list[["second"]] + scatter_locus_data_list[[patient]] <<- scatterplot_data cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T) } @@ -305,6 +350,7 @@ patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) + #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony]) res1 = vector() res2 = vector() resBoth = vector() @@ -346,33 +392,42 @@ } else { scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),] - scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),] + #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),] if(nrow(scatterplot_locus_data) > 0){ scatterplot_locus_data$Rearrangement = product[iter, titleIndex] } - in_one = (scatterplot_locus_data$merge %in% patient1$merge) - in_two = (scatterplot_locus_data$merge %in% patient2$merge) - if(any(in_two)){ - scatterplot_locus_data[in_two,]$type = twoSample - } - in_both = (scatterplot_locus_data$merge %in% patientMerge$merge) - #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]]) - #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches) - if(any(in_both)){ - scatterplot_locus_data[in_both,]$type = "In Both" - } - if(type == "single"){ - single_patients <<- rbind(single_patients, scatterplot_locus_data) - } + + + + #in_one = (scatterplot_locus_data$merge %in% patient1$merge) + #in_two = (scatterplot_locus_data$merge %in% patient2$merge) + #if(any(in_two)){ + # scatterplot_locus_data[in_two,]$type = twoSample + #} + #in_both = (scatterplot_locus_data$merge %in% patientMerge$merge) + ##merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]]) + ##exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches) + #if(any(in_both)){ + # scatterplot_locus_data[in_both,]$type = "In Both" + #} + #if(type == "single" & (nrow(scatterplot_locus_data) > 0 | !any(scatterplot_locus_data$Patient %in% single_patients$Patient))){ + # single_patients <<- rbind(single_patients, scatterplot_locus_data) + #} + + p = NULL + print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data))) if(nrow(scatterplot_locus_data) != 0){ if(on == "normalized_read_count"){ scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) - p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) + #p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) + p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,10^6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) } else { - p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) + #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) + p = ggplot(scatterplot_locus_data, aes(type, Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) } - p = p + geom_point(aes(colour=type), position="jitter") + #p = p + geom_point(aes(colour=type), position="jitter") + p = p + geom_point(aes(colour=type), position="dodge") p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) } else { p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) @@ -453,7 +508,7 @@ if(nrow(single_patients) > 0){ scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) - p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000)) + p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000)) p = p + geom_point(aes(colour=type), position="jitter") p = p + xlab("In one or both samples") + ylab("Reads") p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample") @@ -461,7 +516,8 @@ print(p) dev.off() - p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100)) p = p + geom_point(aes(colour=type), position="jitter") p = p + xlab("In one or both samples") + ylab("Frequency") p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample") @@ -762,6 +818,11 @@ patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony]) + #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) + #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony]) + #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony]) + #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony]) + patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),] patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),] patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),] @@ -882,7 +943,8 @@ scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000)) } else { - p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100)) + #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) } p = p + geom_point(aes(colour=type), position="jitter") p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex])) @@ -1027,4 +1089,4 @@ } else { cat("", file="triplets.txt") } -cat("</table></html>", file=logfile, append=T) \ No newline at end of file +cat("</table></html>", file=logfile, append=T)