Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 13:576de7c96c4f draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 22 Jan 2015 07:12:13 -0500 |
parents | eb5b569b44dd |
children | 1735e91a8f4b |
comparison
equal
deleted
inserted
replaced
12:eb5b569b44dd | 13:576de7c96c4f |
---|---|
13 library(data.table) | 13 library(data.table) |
14 library(grid) | 14 library(grid) |
15 library(parallel) | 15 library(parallel) |
16 #require(xtable) | 16 #require(xtable) |
17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) |
18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) | 18 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) |
19 dat = dat[!is.na(dat$Patient),] | 19 dat = dat[!is.na(dat$Patient),] |
20 dat = dat[!duplicated(dat$Clone_Sequence), ] | 20 dat$Related_to_leukemia_clone = F |
21 | 21 |
22 setwd(outDir) | 22 setwd(outDir) |
23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) | 23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) |
24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) | 24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) |
25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) | 25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) |
26 | 26 |
27 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) | |
28 | |
27 dat$Frequency = ((10^dat$Log10_Frequency)*100) | 29 dat$Frequency = ((10^dat$Log10_Frequency)*100) |
28 | 30 |
29 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T) | |
30 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | |
31 | |
32 most.common = function(x, ret="V"){ | |
33 past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";") | |
34 ux = unique(past) | |
35 if(length(ux) > 1){ | |
36 xtdf = data.frame(table(past)) | |
37 #print(xtdf) | |
38 res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";")) | |
39 #print(res) | |
40 if(ret == "V"){ | |
41 return(res[1]) | |
42 } else { | |
43 return(res[2]) | |
44 } | |
45 } | |
46 | |
47 if(ret == "V"){ | |
48 return(unique(x$V_Segment_Major_Gene)) | |
49 } else { | |
50 return(unique(x$J_Segment_Major_Gene)) | |
51 } | |
52 } | |
53 | |
54 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) | |
55 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | |
56 | |
57 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) | |
58 | |
59 | |
60 dat = dat[dat$Frequency >= min_freq,] | 31 dat = dat[dat$Frequency >= min_freq,] |
61 | 32 |
62 #cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) | 33 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] |
63 #dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) | 34 |
64 dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes | 35 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T) |
36 | |
37 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4) | |
38 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4) | |
39 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")]) | |
40 | |
41 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) | |
42 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) | |
43 | |
44 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] | |
45 | |
46 dat = merge(dat, min_cell_count, by="min_cell_paste") | |
47 | |
48 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 | |
49 | |
65 dat = dat[dat$normalized_read_count >= min_cells,] | 50 dat = dat[dat$normalized_read_count >= min_cells,] |
66 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) | 51 |
67 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] | 52 dat$paste = paste(dat$Sample, dat$Clone_Sequence) |
68 | 53 |
69 patients = split(dat, dat$Patient, drop=T) | 54 patients = split(dat, dat$Patient, drop=T) |
70 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) | 55 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) |
71 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) | 56 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) |
72 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") | 57 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") |
116 } | 101 } |
117 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) | 102 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) |
118 | 103 |
119 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) | 104 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) |
120 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) | 105 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) |
121 patient1$merge = paste(patient1$CDR3_Sense_Sequence) | 106 patient1$merge = paste(patient1$Clone_Sequence) |
122 patient2$merge = paste(patient2$CDR3_Sense_Sequence) | 107 patient2$merge = paste(patient2$Clone_Sequence) |
123 | 108 |
124 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | 109 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") |
125 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | 110 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") |
126 res1 = vector() | 111 res1 = vector() |
127 res2 = vector() | 112 res2 = vector() |
233 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count") | 218 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count") |
234 | 219 |
235 cat("</table></html>", file=logfile, append=T) | 220 cat("</table></html>", file=logfile, append=T) |
236 | 221 |
237 | 222 |
223 | |
238 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){ | 224 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){ |
239 onShort = "reads" | 225 onShort = "reads" |
240 if(on == "Frequency"){ | 226 if(on == "Frequency"){ |
241 onShort = "freq" | 227 onShort = "freq" |
242 } | 228 } |
260 patient2$merge = paste(patient2$CDR3_Sense_Sequence) | 246 patient2$merge = paste(patient2$CDR3_Sense_Sequence) |
261 patient3$merge = paste(patient3$CDR3_Sense_Sequence) | 247 patient3$merge = paste(patient3$CDR3_Sense_Sequence) |
262 | 248 |
263 patientMerge = merge(patient1, patient2, by="merge") | 249 patientMerge = merge(patient1, patient2, by="merge") |
264 patientMerge = merge(patientMerge, patient3, by="merge") | 250 patientMerge = merge(patientMerge, patient3, by="merge") |
265 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") | 251 colnames(patientMerge)[30:length(colnames(patientMerge))] = paste(colnames(patientMerge)[30:length(colnames(patientMerge))], ".z", sep="") |
266 res1 = vector() | 252 res1 = vector() |
267 res2 = vector() | 253 res2 = vector() |
268 res3 = vector() | 254 res3 = vector() |
269 resAll = vector() | 255 resAll = vector() |
270 read1Count = vector() | 256 read1Count = vector() |
292 resAll = append(resAll, sum(all)) | 278 resAll = append(resAll, sum(all)) |
293 #threshhold = 0 | 279 #threshhold = 0 |
294 if(threshhold != 0){ | 280 if(threshhold != 0){ |
295 if(sum(one) > 0){ | 281 if(sum(one) > 0){ |
296 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 282 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
297 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") | 283 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
298 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") | 284 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") |
299 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 285 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
300 } | 286 } |
301 if(sum(two) > 0){ | 287 if(sum(two) > 0){ |
302 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 288 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
303 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") | 289 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
304 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") | 290 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") |
305 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 291 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
306 } | 292 } |
307 if(sum(three) > 0){ | 293 if(sum(three) > 0){ |
308 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 294 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
309 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") | 295 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
310 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") | 296 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") |
311 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 297 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
312 } | 298 } |
313 } | 299 } |
314 if(sum(all) > 0){ | 300 if(sum(all) > 0){ |
358 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080) | 344 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080) |
359 print(plt) | 345 print(plt) |
360 dev.off() | 346 dev.off() |
361 } | 347 } |
362 | 348 |
363 | |
364 triplets$uniqueID = "ID" | 349 triplets$uniqueID = "ID" |
365 | 350 |
366 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | 351 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" |
367 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | 352 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" |
368 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | 353 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" |
371 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" | 356 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" |
372 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" | 357 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" |
373 | 358 |
374 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696" | 359 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696" |
375 | 360 |
376 triplets = data.frame(data.table(triplets)[, list(Patient=unique(.SD$uniqueID), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | 361 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4) |
377 | 362 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4) |
378 triplets$Frequency = (10^as.numeric(triplets$Log10_Frequency))*100 | 363 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")]) |
379 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * 1000000 / 2) | 364 |
365 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J) | |
366 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J) | |
367 | |
368 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] | |
369 | |
370 triplets = merge(triplets, min_cell_count, by="min_cell_paste") | |
371 | |
372 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 | |
373 | |
374 triplets = triplets[triplets$normalized_read_count >= min_cells,] | |
375 | |
376 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste") | |
377 | |
378 triplets = triplets[,!(colnames(triplets) %in% column_drops)] | |
380 | 379 |
381 interval = intervalReads | 380 interval = intervalReads |
382 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | 381 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) |
383 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) | 382 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) |
384 | 383 |