comparison RScript.r @ 7:68c6c7624ffc draft

Uploaded
author davidvanzessen
date Thu, 18 Sep 2014 08:56:44 -0400
parents 8313c6cc65c5
children fa240d1c57a9
comparison
equal deleted inserted replaced
6:8313c6cc65c5 7:68c6c7624ffc
32 dat = dat[dat$normalized_read_count >= min_cells,] 32 dat = dat[dat$normalized_read_count >= min_cells,]
33 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) 33 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence)
34 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T) 34 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T)
35 dat = dat[!duplicated(dat$paste),] 35 dat = dat[!duplicated(dat$paste),]
36 patients = split(dat, dat$Patient, drop=T) 36 patients = split(dat, dat$Patient, drop=T)
37 rm(dat)
38 intervalReads = rev(c(0,10,25,50,100,1000,10000)) 37 intervalReads = rev(c(0,10,25,50,100,1000,10000))
39 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) 38 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
40 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") 39 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
41 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*") 40 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
42 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb") 41 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
89 resBoth = vector() 88 resBoth = vector()
90 read1Count = vector() 89 read1Count = vector()
91 read2Count = vector() 90 read2Count = vector()
92 locussum1 = vector() 91 locussum1 = vector()
93 locussum2 = vector() 92 locussum2 = vector()
94
95 print(patient)
96 #for(iter in 1){ 93 #for(iter in 1){
97 for(iter in 1:length(product[,1])){ 94 for(iter in 1:length(product[,1])){
98 threshhold = product[iter,threshholdIndex] 95 threshhold = product[iter,threshholdIndex]
99 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="") 96 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
100 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="") 97 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
182 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) 179 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
183 180
184 interval = intervalFreq 181 interval = intervalFreq
185 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) 182 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
186 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) 183 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
187 #patientFrequencyCount(patient1)
188 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount)
189 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
190 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
191 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
192 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 184 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
193 185
194 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) 186 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
195 187
196 interval = intervalReads 188 interval = intervalReads
197 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) 189 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
198 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) 190 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
199 #patientResult = patientReadCount(patient1)
200 #lapply(patients[c(5,6,10)], FUN=patientReadCount)
201 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
202 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
203 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") 191 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
204 192
205 cat("</table></html>", file=logfile, append=T) 193 cat("</table></html>", file=logfile, append=T)
206 194
195
196 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
197 onShort = "reads"
198 if(on == "Frequency"){
199 onShort = "freq"
200 }
201 type="triplet"
202
203 threshholdIndex = which(colnames(product) == "interval")
204 V_SegmentIndex = which(colnames(product) == "V_Segments")
205 J_SegmentIndex = which(colnames(product) == "J_Segments")
206 titleIndex = which(colnames(product) == "Titles")
207 sampleIndex = which(colnames(patient1) == "Sample")
208 patientIndex = which(colnames(patient1) == "Patient")
209 oneSample = paste(patient1[1,sampleIndex], sep="")
210 twoSample = paste(patient2[1,sampleIndex], sep="")
211 threeSample = paste(patient3[1,sampleIndex], sep="")
212
213 patientMerge = merge(patient1, patient2, by="Clone_Sequence")
214 patientMerge = merge(patientMerge, patient3, by="Clone_Sequence")
215 colnames(patientMerge)[32:length(colnames(patientMerge))] = paste(colnames(patientMerge)[32:length(colnames(patientMerge))], ".z", sep="")
216 res1 = vector()
217 res2 = vector()
218 res3 = vector()
219 resAll = vector()
220 read1Count = vector()
221 read2Count = vector()
222 read3Count = vector()
223
224 if(appendTriplets){
225 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
226 }
227 for(iter in 1:length(product[,1])){
228 threshhold = product[iter,threshholdIndex]
229 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
230 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
231 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,paste(on, ".x", sep="")] > threshhold & patientMerge[,paste(on, ".y", sep="")] > threshhold & patientMerge[,paste(on, ".z", sep="")] > threshhold)
232 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$Clone_Sequence %in% patientMerge[all,]$Clone_Sequence))
233 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$Clone_Sequence %in% patientMerge[all,]$Clone_Sequence))
234 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$Clone_Sequence %in% patientMerge[all,]$Clone_Sequence))
235
236 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
237 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
238 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
239 res1 = append(res1, sum(one))
240 res2 = append(res2, sum(two))
241 res3 = append(res3, sum(three))
242 resAll = append(resAll, sum(all))
243 #threshhold = 0
244 if(threshhold != 0){
245 if(sum(one) > 0){
246 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
247 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
248 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
249 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
250 }
251 if(sum(two) > 0){
252 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
253 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
254 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
255 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
256 }
257 if(sum(three) > 0){
258 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
259 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
260 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
261 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
262 }
263 }
264 if(sum(all) > 0){
265 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
266 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
267 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
268 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
269 }
270 }
271 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
272 colnames(patientResult)[6] = oneSample
273 colnames(patientResult)[8] = twoSample
274 colnames(patientResult)[10] = threeSample
275
276 colnamesBak = colnames(patientResult)
277 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample), paste("Normalized Read Count", twoSample), paste("Number of sequences", threeSample), paste("Normalized Read Count", threeSample))
278 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
279 colnames(patientResult) = colnamesBak
280
281 patientResult$Locus = factor(patientResult$Locus, Titles)
282 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
283
284 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
285 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
286 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
287 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
288 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
289 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
290 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
291 print(plt)
292 dev.off()
293
294 fontSize = 4
295
296 bak = patientResult
297 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
298 patientResult$relativeValue = patientResult$value * 10
299 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
300 plt = ggplot(patientResult)
301 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
302 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
303 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
304 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
305 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
306 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
307 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
308 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
309 print(plt)
310 dev.off()
311 }
312
313 interval = intervalReads
314 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
315 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
316
317 one = dat[dat$Patient == "VanDongen_cALL_14696.1",]
318 two = dat[dat$Patient == "VanDongen_cALL_14696.2",]
319 three = dat[dat$Patient == "VanDongen_cALL_14696.3",]
320 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count")
321
322 one = dat[dat$Sample == "16278_Left",]
323 two = dat[dat$Sample == "26402_Left",]
324 three = dat[dat$Sample == "26759_Left",]
325 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count")
326
327 one = dat[dat$Sample == "16278_Right",]
328 two = dat[dat$Sample == "26402_Right",]
329 three = dat[dat$Sample == "26759_Right",]
330 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count")
331
332
333 interval = intervalFreq
334 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
335 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
336
337 one = dat[dat$Patient == "VanDongen_cALL_14696.1",]
338 two = dat[dat$Patient == "VanDongen_cALL_14696.2",]
339 three = dat[dat$Patient == "VanDongen_cALL_14696.3",]
340 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", T)
341
342 one = dat[dat$Sample == "16278_Left",]
343 two = dat[dat$Sample == "26402_Left",]
344 three = dat[dat$Sample == "26759_Left",]
345 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", T)
346
347 one = dat[dat$Sample == "16278_Right",]
348 two = dat[dat$Sample == "26402_Right",]
349 three = dat[dat$Sample == "26759_Right",]
350 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", T)
351
352
353