# HG changeset patch
# User davidvanzessen
# Date 1444388297 14400
# Node ID 17e677c72e49cbc60dbdaa89d1f44d46cc5a4607
# Parent 7dd7cefcf72d33724cee0a4134fafc766382eab3
Uploaded
diff -r 7dd7cefcf72d -r 17e677c72e49 RScript.r
--- a/RScript.r Thu Oct 08 10:07:28 2015 -0400
+++ b/RScript.r Fri Oct 09 06:58:17 2015 -0400
@@ -65,6 +65,9 @@
single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
+patient.merge.list = list() # cache of each patient's 'both' table (patientMerge), keyed by patient; roughly 2x faster at the cost of extra memory
+patient.merge.list.second = list() # cache of merge.list[["second"]] per patient, stored and fetched together with patient.merge.list
+
patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
if (!is.data.frame(x) & is.list(x)){
x = x[[1]]
@@ -110,7 +113,7 @@
if(appendtxt){
cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
}
- cat(paste("
", patient, " |
", sep=""), file=logfile, append=T)
+ cat(paste("", patient, " | ", sep=""), file=logfile, append=T)
if(mergeOn == "Clone_Sequence"){
patient1$merge = paste(patient1$Clone_Sequence)
@@ -131,11 +134,19 @@
cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
-
- #fuzzy matching here...
- if(mergeOn == "Clone_Sequence"){
+ start.time = proc.time()
+ merge.list = c()
+
+ if(patient %in% names(patient.merge.list)){
+ patientMerge = patient.merge.list[[patient]]
+ merge.list[["second"]] = patient.merge.list.second[[patient]]
+ cat(paste("", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache) |
", sep=""), file=logfile, append=T)
+
+ print(names(patient.merge.list))
+ } else {
+ #fuzzy matching here...
#merge.list = patientMerge$merge
-
+
#patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
#patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
@@ -144,19 +155,19 @@
#patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
#patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
-
+
#patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
#patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
-
+
patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
-
+
#merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
#merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
-
+
#patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
#patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
-
+
patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
@@ -170,15 +181,15 @@
first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
first.sample = patient.fuzzy[1,"Sample"]
merge.filter = first.merge == patient.fuzzy$merge
-
+
#length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
-
+
first.sample.filter = first.sample == patient.fuzzy$Sample
second.sample.filter = first.sample != patient.fuzzy$Sample
#first match same sample, sum to a single row, same for other sample
#then merge rows like 'normal'
-
+
sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
@@ -266,9 +277,13 @@
patient.fuzzy = patient.fuzzy[-1,]
}
}
-
+ patient.merge.list[[patient]] <<- patientMerge
+ patient.merge.list.second[[patient]] <<- merge.list[["second"]]
+ cat(paste("", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s) | ", sep=""), file=logfile, append=T)
}
-
+
+ print(names(patient.merge.list))
+
patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
res1 = vector()