Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
diff RScript.r @ 60:28c3695259c1 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 29 Oct 2015 11:21:33 -0400 |
parents | 36e307208f1b |
children | 77a090cd0e02 |
line wrap: on
line diff
--- a/RScript.r Thu Oct 15 05:34:17 2015 -0400 +++ b/RScript.r Thu Oct 29 11:21:33 2015 -0400 @@ -442,17 +442,14 @@ interval = intervalFreq intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) -lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) +lapply(patients[4], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) interval = intervalReads intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) -lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count") - -cat("</table></html>", file=logfile, append=T) - +lapply(patients[4], FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count") if(nrow(single_patients) > 0){ scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) @@ -483,6 +480,10 @@ print(p) dev.off() } + +patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory... +patient.merge.list.second = list() + tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){ onShort = "reads" if(on == "Frequency"){ @@ -502,7 +503,7 @@ oneSample = paste(patient1[1,sampleIndex], sep="") twoSample = paste(patient2[1,sampleIndex], sep="") threeSample = paste(patient3[1,sampleIndex], sep="") - + if(mergeOn == "Clone_Sequence"){ patient1$merge = paste(patient1$Clone_Sequence) patient2$merge = paste(patient2$Clone_Sequence) @@ -513,18 +514,249 @@ patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) } - + + #patientMerge = merge(patient1, patient2, by="merge")[NULL,] + patient1.fuzzy = patient1 + patient2.fuzzy = patient2 + patient3.fuzzy = patient3 + + cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T) + + patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J) + patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J) + patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J) + + patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy) + + other.sample.list = list() + other.sample.list[[oneSample]] = c(twoSample, threeSample) + other.sample.list[[twoSample]] = c(oneSample, threeSample) + other.sample.list[[threeSample]] = c(oneSample, twoSample) + patientMerge = merge(patient1, patient2, by="merge") patientMerge = merge(patientMerge, patient3, by="merge") colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="") - patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) + #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) + patientMerge = patientMerge[NULL,] + + duo.merge.list = list() + patientMerge12 = merge(patient1, patient2, by="merge") - patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony]) + #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony]) + patientMerge12 = patientMerge12[NULL,] + duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12 + duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12 + patientMerge13 = merge(patient1, patient3, by="merge") - patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) + #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) + patientMerge13 = patientMerge13[NULL,] + duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13 + duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13 + patientMerge23 = merge(patient2, patient3, by="merge") + #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony]) + patientMerge23 = patientMerge23[NULL,] + duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23 + duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23 + + merge.list = list() + merge.list[["second"]] = vector() + + start.time = proc.time() + if(paste(label1, "123") %in% names(patient.merge.list)){ + patientMerge = patient.merge.list[[paste(label1, "123")]] + patientMerge12 = patient.merge.list[[paste(label1, "12")]] + patientMerge13 = patient.merge.list[[paste(label1, "13")]] + patientMerge23 = patient.merge.list[[paste(label1, "23")]] + + merge.list[["second"]] = patient.merge.list.second[[label1]] + + cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T) + } else { + while(nrow(patient.fuzzy) > 0){ + first.merge = patient.fuzzy[1,"merge"] + first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"] + first.sample = patient.fuzzy[1,"Sample"] + + merge.filter = first.merge == patient.fuzzy$merge + + second.sample = other.sample.list[[first.sample]][1] + third.sample = other.sample.list[[first.sample]][2] + + sample.filter.1 = first.sample == patient.fuzzy$Sample + sample.filter.2 = second.sample == patient.fuzzy$Sample + sample.filter.3 = third.sample == patient.fuzzy$Sample + + sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence) + + match.filter.1 = sample.filter.1 & sequence.filter & merge.filter + match.filter.2 = sample.filter.2 & sequence.filter & merge.filter + match.filter.3 = sample.filter.3 & sequence.filter & merge.filter + + matches.in.1 = any(match.filter.1) + matches.in.2 = any(match.filter.2) + matches.in.3 = any(match.filter.3) + + + + rows.1 = patient.fuzzy[match.filter.1,] + + sum.1 = data.frame(merge = first.clone.sequence, + Patient = label1, + Receptor = rows.1[1,"Receptor"], + Sample = rows.1[1,"Sample"], + Cell_Count = rows.1[1,"Cell_Count"], + Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes), + Log10_Frequency = log10(sum(rows.1$Frequency)), + Total_Read_Count = sum(rows.1$Total_Read_Count), + dsPerM = sum(rows.1$dsPerM), + J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"], + V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"], + Clone_Sequence = first.clone.sequence, + CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"], + Related_to_leukemia_clone = F, + Frequency = sum(rows.1$Frequency), + locus_V = rows.1[1,"locus_V"], + locus_J = rows.1[1,"locus_J"], + normalized_read_count = sum(rows.1$normalized_read_count)) + sum.2 = sum.1[NULL,] + rows.2 = patient.fuzzy[match.filter.2,] + if(matches.in.2){ + sum.2 = data.frame(merge = first.clone.sequence, + Patient = label1, + Receptor = rows.2[1,"Receptor"], + Sample = rows.2[1,"Sample"], + Cell_Count = rows.2[1,"Cell_Count"], + Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes), + Log10_Frequency = log10(sum(rows.2$Frequency)), + Total_Read_Count = sum(rows.2$Total_Read_Count), + dsPerM = sum(rows.2$dsPerM), + J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"], + V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"], + Clone_Sequence = first.clone.sequence, + CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"], + Related_to_leukemia_clone = F, + Frequency = sum(rows.2$Frequency), + locus_V = rows.2[1,"locus_V"], + locus_J = rows.2[1,"locus_J"], + normalized_read_count = sum(rows.2$normalized_read_count)) + } + + sum.3 = sum.1[NULL,] + rows.3 = patient.fuzzy[match.filter.3,] + if(matches.in.3){ + sum.3 = data.frame(merge = first.clone.sequence, + Patient = label1, + Receptor = rows.3[1,"Receptor"], + Sample = rows.3[1,"Sample"], + Cell_Count = rows.3[1,"Cell_Count"], + Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes), + Log10_Frequency = log10(sum(rows.3$Frequency)), + Total_Read_Count = sum(rows.3$Total_Read_Count), + dsPerM = sum(rows.3$dsPerM), + J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"], + V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"], + Clone_Sequence = first.clone.sequence, + CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"], + Related_to_leukemia_clone = F, + Frequency = sum(rows.3$Frequency), + locus_V = rows.3[1,"locus_V"], + locus_J = rows.3[1,"locus_J"], + normalized_read_count = sum(rows.3$normalized_read_count)) + } + + if(matches.in.2 & matches.in.3){ + merge.123 = merge(sum.1, sum.2, by="merge") + merge.123 = merge(merge.123, sum.3, by="merge") + colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="") + #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz]) + + patientMerge = rbind(patientMerge, merge.123) + patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),] + + hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"]) + merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) + + } else if (matches.in.2) { + #other.sample1 = other.sample.list[[first.sample]][1] + #other.sample2 = other.sample.list[[first.sample]][2] + + second.sample = sum.2[,"Sample"] + + current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]] + + merge.12 = merge(sum.1, sum.2, by="merge") + + current.merge.list = rbind(current.merge.list, merge.12) + duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list + + patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),] + + hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"]) + merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) + + } else if (matches.in.3) { + + #other.sample1 = other.sample.list[[first.sample]][1] + #other.sample2 = other.sample.list[[first.sample]][2] + + second.sample = sum.3[,"Sample"] + + current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]] + + merge.13 = merge(sum.1, sum.3, by="merge") + + current.merge.list = rbind(current.merge.list, merge.13) + duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list + + patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),] + + hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"]) + merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) + + } else { + patient.fuzzy = patient.fuzzy[-1,] + } + + tmp.rows = rbind(rows.1, rows.2, rows.3) + tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),] + + if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) { + cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T) + } else { + } + + } + patient.merge.list[[paste(label1, "123")]] = patientMerge + + patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]] + patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]] + patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]] + + patient.merge.list[[paste(label1, "12")]] = patientMerge12 + patient.merge.list[[paste(label1, "13")]] = patientMerge13 + patient.merge.list[[paste(label1, "23")]] = patientMerge23 + + patient.merge.list.second[[label1]] = merge.list[["second"]] + } + cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T) + patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) + patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony]) + patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony]) - + + if(F){ + patientMerge = merge(patient1, patient2, by="merge") + patientMerge = merge(patientMerge, patient3, by="merge") + colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="") + patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) + patientMerge12 = merge(patient1, patient2, by="merge") + patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony]) + patientMerge13 = merge(patient1, patient3, by="merge") + patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) + patientMerge23 = merge(patient2, patient3, by="merge") + patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony]) + } scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge") scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns]) @@ -610,6 +842,7 @@ } } else { #scatterplot data scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] + scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),] in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge) if(sum(in_two) > 0){ scatterplot_locus_data[in_two,]$type = "In two" @@ -693,6 +926,9 @@ } if(nrow(triplets) != 0){ + + cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T) + triplets$uniqueID = "ID" triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" @@ -704,7 +940,9 @@ triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696" - + + cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T) + triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4) triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4) min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")]) @@ -720,23 +958,12 @@ triplets = triplets[triplets$normalized_read_count >= min_cells,] - column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste") + column_drops = c("min_cell_count", "min_cell_paste") triplets = triplets[,!(colnames(triplets) %in% column_drops)] - - #remove duplicate V+J+CDR3, add together numerical values - triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor), - Cell_Count=unique(.SD$Cell_Count), - Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), - Total_Read_Count=sum(.SD$Total_Read_Count), - dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0), - Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone), - Frequency=sum(.SD$Frequency), - normalized_read_count=sum(.SD$normalized_read_count), - Log10_Frequency=sum(.SD$Log10_Frequency), - Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")]) - - + + cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) + interval = intervalReads intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) @@ -744,19 +971,20 @@ one = triplets[triplets$Sample == "14696_reg_BM",] two = triplets[triplets$Sample == "24536_reg_BM",] three = triplets[triplets$Sample == "24062_reg_BM",] - tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T) + #tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T) one = triplets[triplets$Sample == "16278_Left",] two = triplets[triplets$Sample == "26402_Left",] three = triplets[triplets$Sample == "26759_Left",] - tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T) + #tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T) one = triplets[triplets$Sample == "16278_Right",] two = triplets[triplets$Sample == "26402_Right",] three = triplets[triplets$Sample == "26759_Right",] tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T) - + cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) + interval = intervalFreq intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) @@ -764,12 +992,12 @@ one = triplets[triplets$Sample == "14696_reg_BM",] two = triplets[triplets$Sample == "24536_reg_BM",] three = triplets[triplets$Sample == "24062_reg_BM",] - tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F) + #tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F) one = triplets[triplets$Sample == "16278_Left",] two = triplets[triplets$Sample == "26402_Left",] three = triplets[triplets$Sample == "26759_Left",] - tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F) + #tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F) one = triplets[triplets$Sample == "16278_Right",] two = triplets[triplets$Sample == "26402_Right",] @@ -778,3 +1006,4 @@ } else { cat("", file="triplets.txt") } +cat("</table></html>", file=logfile, append=T) \ No newline at end of file