comparison RScript.r @ 13:576de7c96c4f draft

Uploaded
author davidvanzessen
date Thu, 22 Jan 2015 07:12:13 -0500
parents eb5b569b44dd
children 1735e91a8f4b
comparison
equal deleted inserted replaced
12:eb5b569b44dd 13:576de7c96c4f
13 library(data.table) 13 library(data.table)
14 library(grid) 14 library(grid)
15 library(parallel) 15 library(parallel)
16 #require(xtable) 16 #require(xtable)
17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) 18 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
19 dat = dat[!is.na(dat$Patient),] 19 dat = dat[!is.na(dat$Patient),]
20 dat = dat[!duplicated(dat$Clone_Sequence), ] 20 dat$Related_to_leukemia_clone = F
21 21
22 setwd(outDir) 22 setwd(outDir)
23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) 23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) 24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) 25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
26 26
27 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
28
27 dat$Frequency = ((10^dat$Log10_Frequency)*100) 29 dat$Frequency = ((10^dat$Log10_Frequency)*100)
28 30
29 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T)
30 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")])
31
32 most.common = function(x, ret="V"){
33 past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";")
34 ux = unique(past)
35 if(length(ux) > 1){
36 xtdf = data.frame(table(past))
37 #print(xtdf)
38 res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";"))
39 #print(res)
40 if(ret == "V"){
41 return(res[1])
42 } else {
43 return(res[2])
44 }
45 }
46
47 if(ret == "V"){
48 return(unique(x$V_Segment_Major_Gene))
49 } else {
50 return(unique(x$J_Segment_Major_Gene))
51 }
52 }
53
54 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")])
55 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")])
56
57 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
58
59
60 dat = dat[dat$Frequency >= min_freq,] 31 dat = dat[dat$Frequency >= min_freq,]
61 32
62 #cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) 33 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
63 #dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) 34
64 dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes 35 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
36
37 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
38 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
39 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
40
41 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
42 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
43
44 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
45
46 dat = merge(dat, min_cell_count, by="min_cell_paste")
47
48 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
49
65 dat = dat[dat$normalized_read_count >= min_cells,] 50 dat = dat[dat$normalized_read_count >= min_cells,]
66 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) 51
67 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] 52 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
68 53
69 patients = split(dat, dat$Patient, drop=T) 54 patients = split(dat, dat$Patient, drop=T)
70 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) 55 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
71 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) 56 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
72 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") 57 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
116 } 101 }
117 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) 102 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
118 103
119 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) 104 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
120 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) 105 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
121 patient1$merge = paste(patient1$CDR3_Sense_Sequence) 106 patient1$merge = paste(patient1$Clone_Sequence)
122 patient2$merge = paste(patient2$CDR3_Sense_Sequence) 107 patient2$merge = paste(patient2$Clone_Sequence)
123 108
124 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") 109 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
125 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") 110 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
126 res1 = vector() 111 res1 = vector()
127 res2 = vector() 112 res2 = vector()
233 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count") 218 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
234 219
235 cat("</table></html>", file=logfile, append=T) 220 cat("</table></html>", file=logfile, append=T)
236 221
237 222
223
238 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){ 224 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
239 onShort = "reads" 225 onShort = "reads"
240 if(on == "Frequency"){ 226 if(on == "Frequency"){
241 onShort = "freq" 227 onShort = "freq"
242 } 228 }
260 patient2$merge = paste(patient2$CDR3_Sense_Sequence) 246 patient2$merge = paste(patient2$CDR3_Sense_Sequence)
261 patient3$merge = paste(patient3$CDR3_Sense_Sequence) 247 patient3$merge = paste(patient3$CDR3_Sense_Sequence)
262 248
263 patientMerge = merge(patient1, patient2, by="merge") 249 patientMerge = merge(patient1, patient2, by="merge")
264 patientMerge = merge(patientMerge, patient3, by="merge") 250 patientMerge = merge(patientMerge, patient3, by="merge")
265 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") 251 colnames(patientMerge)[30:length(colnames(patientMerge))] = paste(colnames(patientMerge)[30:length(colnames(patientMerge))], ".z", sep="")
266 res1 = vector() 252 res1 = vector()
267 res2 = vector() 253 res2 = vector()
268 res3 = vector() 254 res3 = vector()
269 resAll = vector() 255 resAll = vector()
270 read1Count = vector() 256 read1Count = vector()
292 resAll = append(resAll, sum(all)) 278 resAll = append(resAll, sum(all))
293 #threshhold = 0 279 #threshhold = 0
294 if(threshhold != 0){ 280 if(threshhold != 0){
295 if(sum(one) > 0){ 281 if(sum(one) > 0){
296 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] 282 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
297 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") 283 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
298 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") 284 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
299 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 285 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
300 } 286 }
301 if(sum(two) > 0){ 287 if(sum(two) > 0){
302 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] 288 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
303 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") 289 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
304 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") 290 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
305 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 291 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
306 } 292 }
307 if(sum(three) > 0){ 293 if(sum(three) > 0){
308 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] 294 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
309 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") 295 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
310 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") 296 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
311 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 297 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
312 } 298 }
313 } 299 }
314 if(sum(all) > 0){ 300 if(sum(all) > 0){
358 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080) 344 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
359 print(plt) 345 print(plt)
360 dev.off() 346 dev.off()
361 } 347 }
362 348
363
364 triplets$uniqueID = "ID" 349 triplets$uniqueID = "ID"
365 350
366 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" 351 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
367 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" 352 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
368 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" 353 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
371 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" 356 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
372 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" 357 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
373 358
374 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696" 359 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
375 360
376 triplets = data.frame(data.table(triplets)[, list(Patient=unique(.SD$uniqueID), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) 361 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
377 362 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
378 triplets$Frequency = (10^as.numeric(triplets$Log10_Frequency))*100 363 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
379 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * 1000000 / 2) 364
365 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
366 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
367
368 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
369
370 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
371
372 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
373
374 triplets = triplets[triplets$normalized_read_count >= min_cells,]
375
376 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
377
378 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
380 379
381 interval = intervalReads 380 interval = intervalReads
382 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) 381 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
383 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) 382 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
384 383