# HG changeset patch # User davidvanzessen # Date 1444388297 14400 # Node ID 17e677c72e49cbc60dbdaa89d1f44d46cc5a4607 # Parent 7dd7cefcf72d33724cee0a4134fafc766382eab3 Uploaded diff -r 7dd7cefcf72d -r 17e677c72e49 RScript.r --- a/RScript.r Thu Oct 08 10:07:28 2015 -0400 +++ b/RScript.r Fri Oct 09 06:58:17 2015 -0400 @@ -65,6 +65,9 @@ single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0)) +patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory... +patient.merge.list.second = list() + patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ if (!is.data.frame(x) & is.list(x)){ x = x[[1]] @@ -110,7 +113,7 @@ if(appendtxt){ cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) } - cat(paste("", patient, "", sep=""), file=logfile, append=T) + cat(paste("", patient, "", sep=""), file=logfile, append=T) if(mergeOn == "Clone_Sequence"){ patient1$merge = paste(patient1$Clone_Sequence) @@ -131,11 +134,19 @@ cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence - - #fuzzy matching here... - if(mergeOn == "Clone_Sequence"){ + start.time = proc.time() + merge.list = c() + + if(patient %in% names(patient.merge.list)){ + patientMerge = patient.merge.list[[patient]] + merge.list[["second"]] = patient.merge.list.second[[patient]] + cat(paste("", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)", sep=""), file=logfile, append=T) + + print(names(patient.merge.list)) + } else { + #fuzzy matching here... #merge.list = patientMerge$merge - + #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),] #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),] @@ -144,19 +155,19 @@ #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence) #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence) - + #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence) #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence) - + patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J) patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J) - + #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove? #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,] - + #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] - + patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy) patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),] @@ -170,15 +181,15 @@ first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"] first.sample = patient.fuzzy[1,"Sample"] merge.filter = first.merge == patient.fuzzy$merge - + #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9 - + first.sample.filter = first.sample == patient.fuzzy$Sample second.sample.filter = first.sample != patient.fuzzy$Sample #first match same sample, sum to a single row, same for other sample #then merge rows like 'normal' - + sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence) @@ -266,9 +277,13 @@ patient.fuzzy = patient.fuzzy[-1,] } } - + patient.merge.list[[patient]] <<- patientMerge + patient.merge.list.second[[patient]] <<- merge.list[["second"]] + cat(paste("", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)", sep=""), file=logfile, append=T) } - + + print(names(patient.merge.list)) + patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) res1 = vector()