comparison RScript.r @ 12:eb5b569b44dd draft

Uploaded
author davidvanzessen
date Wed, 10 Dec 2014 09:49:16 -0500
parents bc4612998d50
children 576de7c96c4f
comparison
equal deleted inserted replaced
11:bc4612998d50 12:eb5b569b44dd
15 library(parallel) 15 library(parallel)
16 #require(xtable) 16 #require(xtable)
17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) 18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F)
19 dat = dat[!is.na(dat$Patient),] 19 dat = dat[!is.na(dat$Patient),]
20 dat = dat[!duplicated(dat$Clone_Sequence), ]
20 21
21 setwd(outDir) 22 setwd(outDir)
22 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) 23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
23 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) 24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
24 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) 25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
25 26
26 str(dat) 27 dat$Frequency = ((10^dat$Log10_Frequency)*100)
28
27 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T) 29 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T)
28 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) 30 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")])
29 31
30 most.common = function(x){ 32 most.common = function(x, ret="V"){
31 ux = unique(x) 33 past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";")
34 ux = unique(past)
32 if(length(ux) > 1){ 35 if(length(ux) > 1){
33 xtdf = data.frame(table(x)) 36 xtdf = data.frame(table(past))
34 return(xtdf$Var1[which.max(xtdf$Freq)])
35 #print(xtdf) 37 #print(xtdf)
36 } 38 res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";"))
37 return(unique(x)) 39 #print(res)
40 if(ret == "V"){
41 return(res[1])
42 } else {
43 return(res[2])
44 }
45 }
46
47 if(ret == "V"){
48 return(unique(x$V_Segment_Major_Gene))
49 } else {
50 return(unique(x$J_Segment_Major_Gene))
51 }
38 } 52 }
39 53
40 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene=most.common(.SD$V_Segment_Major_Gene), J_Segment_Major_Gene=most.common(.SD$J_Segment_Major_Gene), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) 54 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")])
55 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")])
41 56
42 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) 57 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
43 dat$Frequency = ((10^dat$Log10_Frequency)*100) 58
44 59
45 dat = dat[dat$Frequency >= min_freq,] 60 dat = dat[dat$Frequency >= min_freq,]
46 61
47 cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) 62 #cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T)
48 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) 63 #dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2)
64 dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes
49 dat = dat[dat$normalized_read_count >= min_cells,] 65 dat = dat[dat$normalized_read_count >= min_cells,]
50 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) 66 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence)
51 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] 67 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
52 68
53 patients = split(dat, dat$Patient, drop=T) 69 patients = split(dat, dat$Patient, drop=T)
98 if(appendtxt){ 114 if(appendtxt){
99 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) 115 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
100 } 116 }
101 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) 117 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
102 118
103 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) 119 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
104 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) 120 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
105 121 patient1$merge = paste(patient1$CDR3_Sense_Sequence)
122 patient2$merge = paste(patient2$CDR3_Sense_Sequence)
123
124 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
106 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") 125 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
107 res1 = vector() 126 res1 = vector()
108 res2 = vector() 127 res2 = vector()
109 resBoth = vector() 128 resBoth = vector()
110 read1Count = vector() 129 read1Count = vector()
231 patientIndex = which(colnames(patient1) == "Patient") 250 patientIndex = which(colnames(patient1) == "Patient")
232 oneSample = paste(patient1[1,sampleIndex], sep="") 251 oneSample = paste(patient1[1,sampleIndex], sep="")
233 twoSample = paste(patient2[1,sampleIndex], sep="") 252 twoSample = paste(patient2[1,sampleIndex], sep="")
234 threeSample = paste(patient3[1,sampleIndex], sep="") 253 threeSample = paste(patient3[1,sampleIndex], sep="")
235 254
236 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) 255 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
237 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) 256 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
238 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) 257 #patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
258
259 patient1$merge = paste(patient1$CDR3_Sense_Sequence)
260 patient2$merge = paste(patient2$CDR3_Sense_Sequence)
261 patient3$merge = paste(patient3$CDR3_Sense_Sequence)
239 262
240 patientMerge = merge(patient1, patient2, by="merge") 263 patientMerge = merge(patient1, patient2, by="merge")
241 patientMerge = merge(patientMerge, patient3, by="merge") 264 patientMerge = merge(patientMerge, patient3, by="merge")
242 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") 265 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="")
243 res1 = vector() 266 res1 = vector()
269 resAll = append(resAll, sum(all)) 292 resAll = append(resAll, sum(all))
270 #threshhold = 0 293 #threshhold = 0
271 if(threshhold != 0){ 294 if(threshhold != 0){
272 if(sum(one) > 0){ 295 if(sum(one) > 0){
273 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] 296 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
274 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") 297 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone")
275 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") 298 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
276 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 299 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
277 } 300 }
278 if(sum(two) > 0){ 301 if(sum(two) > 0){
279 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] 302 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
280 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") 303 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone")
281 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") 304 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
282 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 305 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
283 } 306 }
284 if(sum(three) > 0){ 307 if(sum(three) > 0){
285 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] 308 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
286 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") 309 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone")
287 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") 310 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
288 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 311 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
289 } 312 }
290 } 313 }
291 if(sum(all) > 0){ 314 if(sum(all) > 0){