Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 13:576de7c96c4f draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Thu, 22 Jan 2015 07:12:13 -0500 |
| parents | eb5b569b44dd |
| children | 1735e91a8f4b |
comparison
equal
deleted
inserted
replaced
| 12:eb5b569b44dd | 13:576de7c96c4f |
|---|---|
| 13 library(data.table) | 13 library(data.table) |
| 14 library(grid) | 14 library(grid) |
| 15 library(parallel) | 15 library(parallel) |
| 16 #require(xtable) | 16 #require(xtable) |
| 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) |
| 18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) | 18 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) |
| 19 dat = dat[!is.na(dat$Patient),] | 19 dat = dat[!is.na(dat$Patient),] |
| 20 dat = dat[!duplicated(dat$Clone_Sequence), ] | 20 dat$Related_to_leukemia_clone = F |
| 21 | 21 |
| 22 setwd(outDir) | 22 setwd(outDir) |
| 23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) | 23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) |
| 24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) | 24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) |
| 25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) | 25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) |
| 26 | 26 |
| 27 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) | |
| 28 | |
| 27 dat$Frequency = ((10^dat$Log10_Frequency)*100) | 29 dat$Frequency = ((10^dat$Log10_Frequency)*100) |
| 28 | 30 |
| 29 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T) | |
| 30 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | |
| 31 | |
| 32 most.common = function(x, ret="V"){ | |
| 33 past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";") | |
| 34 ux = unique(past) | |
| 35 if(length(ux) > 1){ | |
| 36 xtdf = data.frame(table(past)) | |
| 37 #print(xtdf) | |
| 38 res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";")) | |
| 39 #print(res) | |
| 40 if(ret == "V"){ | |
| 41 return(res[1]) | |
| 42 } else { | |
| 43 return(res[2]) | |
| 44 } | |
| 45 } | |
| 46 | |
| 47 if(ret == "V"){ | |
| 48 return(unique(x$V_Segment_Major_Gene)) | |
| 49 } else { | |
| 50 return(unique(x$J_Segment_Major_Gene)) | |
| 51 } | |
| 52 } | |
| 53 | |
| 54 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) | |
| 55 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | |
| 56 | |
| 57 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) | |
| 58 | |
| 59 | |
| 60 dat = dat[dat$Frequency >= min_freq,] | 31 dat = dat[dat$Frequency >= min_freq,] |
| 61 | 32 |
| 62 #cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) | 33 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] |
| 63 #dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) | 34 |
| 64 dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes | 35 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T) |
| 36 | |
| 37 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4) | |
| 38 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4) | |
| 39 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")]) | |
| 40 | |
| 41 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) | |
| 42 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) | |
| 43 | |
| 44 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] | |
| 45 | |
| 46 dat = merge(dat, min_cell_count, by="min_cell_paste") | |
| 47 | |
| 48 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 | |
| 49 | |
| 65 dat = dat[dat$normalized_read_count >= min_cells,] | 50 dat = dat[dat$normalized_read_count >= min_cells,] |
| 66 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) | 51 |
| 67 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] | 52 dat$paste = paste(dat$Sample, dat$Clone_Sequence) |
| 68 | 53 |
| 69 patients = split(dat, dat$Patient, drop=T) | 54 patients = split(dat, dat$Patient, drop=T) |
| 70 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) | 55 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) |
| 71 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) | 56 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) |
| 72 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") | 57 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") |
| 116 } | 101 } |
| 117 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) | 102 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) |
| 118 | 103 |
| 119 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) | 104 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) |
| 120 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) | 105 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) |
| 121 patient1$merge = paste(patient1$CDR3_Sense_Sequence) | 106 patient1$merge = paste(patient1$Clone_Sequence) |
| 122 patient2$merge = paste(patient2$CDR3_Sense_Sequence) | 107 patient2$merge = paste(patient2$Clone_Sequence) |
| 123 | 108 |
| 124 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | 109 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") |
| 125 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | 110 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") |
| 126 res1 = vector() | 111 res1 = vector() |
| 127 res2 = vector() | 112 res2 = vector() |
| 233 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count") | 218 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count") |
| 234 | 219 |
| 235 cat("</table></html>", file=logfile, append=T) | 220 cat("</table></html>", file=logfile, append=T) |
| 236 | 221 |
| 237 | 222 |
| 223 | |
| 238 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){ | 224 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){ |
| 239 onShort = "reads" | 225 onShort = "reads" |
| 240 if(on == "Frequency"){ | 226 if(on == "Frequency"){ |
| 241 onShort = "freq" | 227 onShort = "freq" |
| 242 } | 228 } |
| 260 patient2$merge = paste(patient2$CDR3_Sense_Sequence) | 246 patient2$merge = paste(patient2$CDR3_Sense_Sequence) |
| 261 patient3$merge = paste(patient3$CDR3_Sense_Sequence) | 247 patient3$merge = paste(patient3$CDR3_Sense_Sequence) |
| 262 | 248 |
| 263 patientMerge = merge(patient1, patient2, by="merge") | 249 patientMerge = merge(patient1, patient2, by="merge") |
| 264 patientMerge = merge(patientMerge, patient3, by="merge") | 250 patientMerge = merge(patientMerge, patient3, by="merge") |
| 265 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") | 251 colnames(patientMerge)[30:length(colnames(patientMerge))] = paste(colnames(patientMerge)[30:length(colnames(patientMerge))], ".z", sep="") |
| 266 res1 = vector() | 252 res1 = vector() |
| 267 res2 = vector() | 253 res2 = vector() |
| 268 res3 = vector() | 254 res3 = vector() |
| 269 resAll = vector() | 255 resAll = vector() |
| 270 read1Count = vector() | 256 read1Count = vector() |
| 292 resAll = append(resAll, sum(all)) | 278 resAll = append(resAll, sum(all)) |
| 293 #threshhold = 0 | 279 #threshhold = 0 |
| 294 if(threshhold != 0){ | 280 if(threshhold != 0){ |
| 295 if(sum(one) > 0){ | 281 if(sum(one) > 0){ |
| 296 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 282 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
| 297 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") | 283 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
| 298 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") | 284 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 299 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 285 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 300 } | 286 } |
| 301 if(sum(two) > 0){ | 287 if(sum(two) > 0){ |
| 302 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 288 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
| 303 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") | 289 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
| 304 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") | 290 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 305 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 291 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 306 } | 292 } |
| 307 if(sum(three) > 0){ | 293 if(sum(three) > 0){ |
| 308 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 294 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
| 309 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") | 295 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
| 310 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") | 296 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 311 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 297 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 312 } | 298 } |
| 313 } | 299 } |
| 314 if(sum(all) > 0){ | 300 if(sum(all) > 0){ |
| 358 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080) | 344 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080) |
| 359 print(plt) | 345 print(plt) |
| 360 dev.off() | 346 dev.off() |
| 361 } | 347 } |
| 362 | 348 |
| 363 | |
| 364 triplets$uniqueID = "ID" | 349 triplets$uniqueID = "ID" |
| 365 | 350 |
| 366 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | 351 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" |
| 367 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | 352 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" |
| 368 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | 353 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" |
| 371 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" | 356 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" |
| 372 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" | 357 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" |
| 373 | 358 |
| 374 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696" | 359 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696" |
| 375 | 360 |
| 376 triplets = data.frame(data.table(triplets)[, list(Patient=unique(.SD$uniqueID), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | 361 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4) |
| 377 | 362 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4) |
| 378 triplets$Frequency = (10^as.numeric(triplets$Log10_Frequency))*100 | 363 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")]) |
| 379 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * 1000000 / 2) | 364 |
| 365 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J) | |
| 366 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J) | |
| 367 | |
| 368 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] | |
| 369 | |
| 370 triplets = merge(triplets, min_cell_count, by="min_cell_paste") | |
| 371 | |
| 372 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 | |
| 373 | |
| 374 triplets = triplets[triplets$normalized_read_count >= min_cells,] | |
| 375 | |
| 376 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste") | |
| 377 | |
| 378 triplets = triplets[,!(colnames(triplets) %in% column_drops)] | |
| 380 | 379 |
| 381 interval = intervalReads | 380 interval = intervalReads |
| 382 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | 381 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) |
| 383 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) | 382 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) |
| 384 | 383 |
