comparison RScript.txt @ 60:28c3695259c1 draft

Uploaded
author davidvanzessen
date Thu, 29 Oct 2015 11:21:33 -0400
parents
children
comparison
equal deleted inserted replaced
59:36e307208f1b 60:28c3695259c1
1 args <- commandArgs(trailingOnly = TRUE)
2
3 inFile = args[1]
4 outDir = args[2]
5 logfile = args[3]
6 min_freq = as.numeric(args[4])
7 min_cells = as.numeric(args[5])
8 mergeOn = args[6]
9
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
11
12 library(ggplot2)
13 library(reshape2)
14 library(data.table)
15 library(grid)
16 library(parallel)
17 #require(xtable)
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
21 dat$dsPerM = 0
22 dat = dat[!is.na(dat$Patient),]
23 dat$Related_to_leukemia_clone = F
24
25 setwd(outDir)
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
29
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
31
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
33
34 dat = dat[dat$Frequency >= min_freq,]
35
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
37
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
39
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
43
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
46
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
48 print(paste("rows:", nrow(dat)))
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
50 print(paste("rows:", nrow(dat)))
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
52
53 dat = dat[dat$normalized_read_count >= min_cells,]
54
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
56
57 patients = split(dat, dat$Patient, drop=T)
58 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
59 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
60 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
61 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
62 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
63 Titles = factor(Titles, levels=Titles)
64 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
65
66 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
67
68 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
69 patient.merge.list.second = list()
70 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
71 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
72 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
73 if (!is.data.frame(x) & is.list(x)){
74 x = x[[1]]
75 }
76 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
77 x = data.frame(x,stringsAsFactors=F)
78 onShort = "reads"
79 if(on == "Frequency"){
80 onShort = "freq"
81 }
82 onx = paste(on, ".x", sep="")
83 ony = paste(on, ".y", sep="")
84 splt = split(x, x$Sample, drop=T)
85 type="pair"
86 if(length(splt) == 1){
87 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
88 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
89 type="single"
90 }
91 patient1 = splt[[1]]
92 patient2 = splt[[2]]
93
94 threshholdIndex = which(colnames(product) == "interval")
95 V_SegmentIndex = which(colnames(product) == "V_Segments")
96 J_SegmentIndex = which(colnames(product) == "J_Segments")
97 titleIndex = which(colnames(product) == "Titles")
98 sampleIndex = which(colnames(x) == "Sample")
99 patientIndex = which(colnames(x) == "Patient")
100 oneSample = paste(patient1[1,sampleIndex], sep="")
101 twoSample = paste(patient2[1,sampleIndex], sep="")
102 patient = paste(x[1,patientIndex])
103
104 switched = F
105 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
106 tmp = twoSample
107 twoSample = oneSample
108 oneSample = tmp
109 tmp = patient1
110 patient1 = patient2
111 patient2 = tmp
112 switched = T
113 }
114 if(appendtxt){
115 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
116 }
117 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
118
119 if(mergeOn == "Clone_Sequence"){
120 patient1$merge = paste(patient1$Clone_Sequence)
121 patient2$merge = paste(patient2$Clone_Sequence)
122 } else {
123 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
124 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
125 }
126
127 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
128 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
129 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
130 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
131 scatterplot_data$on = onShort
132
133 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
134 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
135
136 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
137
138 start.time = proc.time()
139 merge.list = c()
140
141 if(patient %in% names(patient.merge.list)){
142 patientMerge = patient.merge.list[[patient]]
143 merge.list[["second"]] = patient.merge.list.second[[patient]]
144 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
145
146 print(names(patient.merge.list))
147 } else {
148 #fuzzy matching here...
149 #merge.list = patientMerge$merge
150
151 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
152 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
153
154 patient1.fuzzy = patient1
155 patient2.fuzzy = patient2
156
157 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
158 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
159
160 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
161 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
162
163 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
164 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
165
166 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
167 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
168
169 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
170 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
171
172 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
173 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
174
175 merge.list = list()
176
177 merge.list[["second"]] = vector()
178
179 while(nrow(patient.fuzzy) > 1){
180 first.merge = patient.fuzzy[1,"merge"]
181 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
182 first.sample = patient.fuzzy[1,"Sample"]
183 merge.filter = first.merge == patient.fuzzy$merge
184
185 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
186
187 first.sample.filter = first.sample == patient.fuzzy$Sample
188 second.sample.filter = first.sample != patient.fuzzy$Sample
189
190 #first match same sample, sum to a single row, same for other sample
191 #then merge rows like 'normal'
192
193 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
194
195
196
197 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
198 first.match.filter = merge.filter & sequence.filter & first.sample.filter
199 second.match.filter = merge.filter & sequence.filter & second.sample.filter
200
201 first.rows = patient.fuzzy[first.match.filter,]
202 second.rows = patient.fuzzy[second.match.filter,]
203
204 first.rows.v = table(first.rows$V_Segment_Major_Gene)
205 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
206 first.rows.j = table(first.rows$J_Segment_Major_Gene)
207 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
208
209 first.sum = data.frame(merge = first.clone.sequence,
210 Patient = patient,
211 Receptor = first.rows[1,"Receptor"],
212 Sample = first.rows[1,"Sample"],
213 Cell_Count = first.rows[1,"Cell_Count"],
214 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
215 Log10_Frequency = log10(sum(first.rows$Frequency)),
216 Total_Read_Count = sum(first.rows$Total_Read_Count),
217 dsPerM = sum(first.rows$dsPerM),
218 J_Segment_Major_Gene = first.rows.j,
219 V_Segment_Major_Gene = first.rows.v,
220 Clone_Sequence = first.clone.sequence,
221 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
222 Related_to_leukemia_clone = F,
223 Frequency = sum(first.rows$Frequency),
224 locus_V = first.rows[1,"locus_V"],
225 locus_J = first.rows[1,"locus_J"],
226 min_cell_count = first.rows[1,"min_cell_count"],
227 normalized_read_count = sum(first.rows$normalized_read_count),
228 paste = first.rows[1,"paste"],
229 min_cell_paste = first.rows[1,"min_cell_paste"])
230
231 if(nrow(second.rows) > 0){
232 second.rows.v = table(second.rows$V_Segment_Major_Gene)
233 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
234 second.rows.j = table(second.rows$J_Segment_Major_Gene)
235 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
236
237 second.sum = data.frame(merge = first.clone.sequence,
238 Patient = patient,
239 Receptor = second.rows[1,"Receptor"],
240 Sample = second.rows[1,"Sample"],
241 Cell_Count = second.rows[1,"Cell_Count"],
242 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
243 Log10_Frequency = log10(sum(second.rows$Frequency)),
244 Total_Read_Count = sum(second.rows$Total_Read_Count),
245 dsPerM = sum(second.rows$dsPerM),
246 J_Segment_Major_Gene = second.rows.j,
247 V_Segment_Major_Gene = second.rows.v,
248 Clone_Sequence = first.clone.sequence,
249 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
250 Related_to_leukemia_clone = F,
251 Frequency = sum(second.rows$Frequency),
252 locus_V = second.rows[1,"locus_V"],
253 locus_J = second.rows[1,"locus_J"],
254 min_cell_count = second.rows[1,"min_cell_count"],
255 normalized_read_count = sum(second.rows$normalized_read_count),
256 paste = second.rows[1,"paste"],
257 min_cell_paste = second.rows[1,"min_cell_paste"])
258
259 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
260 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
261
262 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
263 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
264
265 tmp.rows = rbind(first.rows, second.rows)
266 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
267
268 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
269 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
270 } else {
271 second.clone.sequence = second.rows[1,"Clone_Sequence"]
272 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
273 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
274 } else {
275 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
276 }
277 }
278
279 } else {
280 patient.fuzzy = patient.fuzzy[-1,]
281 }
282 }
283 patient.merge.list[[patient]] <<- patientMerge
284 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
285 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
286 }
287
288 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
289 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
290
291
292 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
293 res1 = vector()
294 res2 = vector()
295 resBoth = vector()
296 read1Count = vector()
297 read2Count = vector()
298 locussum1 = vector()
299 locussum2 = vector()
300
301 #for(iter in 1){
302 for(iter in 1:length(product[,1])){
303 threshhold = product[iter,threshholdIndex]
304 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
305 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
306 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
307 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
308 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
309 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
310 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
311 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
312 res1 = append(res1, sum(one))
313 res2 = append(res2, sum(two))
314 resBoth = append(resBoth, sum(both))
315 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
316 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
317 #threshhold = 0
318 if(threshhold != 0){
319 if(sum(one) > 0){
320 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
321 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
322 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
323 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
324 }
325 if(sum(two) > 0){
326 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
327 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
328 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
329 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
330 }
331 } else {
332 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
333 #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),]
334 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
335 if(nrow(scatterplot_locus_data) > 0){
336 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
337 }
338 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
339 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
340 if(any(in_two)){
341 scatterplot_locus_data[in_two,]$type = twoSample
342 }
343 in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
344 #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
345 #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
346 if(any(in_both)){
347 scatterplot_locus_data[in_both,]$type = "In Both"
348 }
349 if(type == "single"){
350 single_patients <<- rbind(single_patients, scatterplot_locus_data)
351 }
352 p = NULL
353 if(nrow(scatterplot_locus_data) != 0){
354 if(on == "normalized_read_count"){
355 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
356 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
357 } else {
358 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
359 }
360 p = p + geom_point(aes(colour=type), position="jitter")
361 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
362 } else {
363 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
364 }
365 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
366 print(p)
367 dev.off()
368 }
369 if(sum(both) > 0){
370 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
371 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
372 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
373 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
374 }
375 }
376 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
377 if(sum(is.na(patientResult$percentage)) > 0){
378 patientResult[is.na(patientResult$percentage),]$percentage = 0
379 }
380 colnames(patientResult)[6] = oneSample
381 colnames(patientResult)[8] = twoSample
382 colnamesBak = colnames(patientResult)
383 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
384 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
385 colnames(patientResult) = colnamesBak
386
387 patientResult$Locus = factor(patientResult$Locus, Titles)
388 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
389
390 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
391 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
392 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
393 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
394 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
395 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
396 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
397 print(plt)
398 dev.off()
399 #(t,r,b,l)
400 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
401 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
402 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
403 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
404 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
405 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
406 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
407 print(plt)
408 dev.off()
409
410 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
411 patientResult$relativeValue = patientResult$value * 10
412 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
413 plt = ggplot(patientResult)
414 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
415 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
416 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
417 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
418 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
419 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
420 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
421 print(plt)
422 dev.off()
423 }
424
425 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
426
427 interval = intervalFreq
428 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
429 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
430 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
431
432 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
433
434 interval = intervalReads
435 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
436 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
437 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
438
439 cat("</table></html>", file=logfile, append=T)
440
441
442 if(nrow(single_patients) > 0){
443 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
444 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
445 p = p + geom_point(aes(colour=type), position="jitter")
446 p = p + xlab("In one or both samples") + ylab("Reads")
447 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
448 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
449 print(p)
450 dev.off()
451
452 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
453 p = p + geom_point(aes(colour=type), position="jitter")
454 p = p + xlab("In one or both samples") + ylab("Frequency")
455 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
456 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
457 print(p)
458 dev.off()
459 } else {
460 empty <- data.frame()
461 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
462
463 png("singles_reads_scatterplot.png", width=400, height=300)
464 print(p)
465 dev.off()
466
467 png("singles_freq_scatterplot.png", width=400, height=300)
468 print(p)
469 dev.off()
470 }
471 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
472 onShort = "reads"
473 if(on == "Frequency"){
474 onShort = "freq"
475 }
476 onx = paste(on, ".x", sep="")
477 ony = paste(on, ".y", sep="")
478 onz = paste(on, ".z", sep="")
479 type="triplet"
480
481 threshholdIndex = which(colnames(product) == "interval")
482 V_SegmentIndex = which(colnames(product) == "V_Segments")
483 J_SegmentIndex = which(colnames(product) == "J_Segments")
484 titleIndex = which(colnames(product) == "Titles")
485 sampleIndex = which(colnames(patient1) == "Sample")
486 patientIndex = which(colnames(patient1) == "Patient")
487 oneSample = paste(patient1[1,sampleIndex], sep="")
488 twoSample = paste(patient2[1,sampleIndex], sep="")
489 threeSample = paste(patient3[1,sampleIndex], sep="")
490
491 if(mergeOn == "Clone_Sequence"){
492 patient1$merge = paste(patient1$Clone_Sequence)
493 patient2$merge = paste(patient2$Clone_Sequence)
494 patient3$merge = paste(patient3$Clone_Sequence)
495
496 } else {
497 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
498 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
499 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
500 }
501
502 patientMerge = merge(patient1, patient2, by="merge")
503 patientMerge = merge(patientMerge, patient3, by="merge")
504 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
505 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
506 patientMerge12 = merge(patient1, patient2, by="merge")
507 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
508 patientMerge13 = merge(patient1, patient3, by="merge")
509 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
510 patientMerge23 = merge(patient2, patient3, by="merge")
511 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
512
513
514 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
515 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
516 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
517 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
518
519 res1 = vector()
520 res2 = vector()
521 res3 = vector()
522 res12 = vector()
523 res13 = vector()
524 res23 = vector()
525 resAll = vector()
526 read1Count = vector()
527 read2Count = vector()
528 read3Count = vector()
529
530 if(appendTriplets){
531 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
532 }
533 for(iter in 1:length(product[,1])){
534 threshhold = product[iter,threshholdIndex]
535 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
536 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
537 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
538 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
539
540 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
541 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
542 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
543
544 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
545 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
546 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
547
548 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
549 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
550 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
551 res1 = append(res1, sum(one))
552 res2 = append(res2, sum(two))
553 res3 = append(res3, sum(three))
554 resAll = append(resAll, sum(all))
555 res12 = append(res12, sum(one_two))
556 res13 = append(res13, sum(one_three))
557 res23 = append(res23, sum(two_three))
558 #threshhold = 0
559 if(threshhold != 0){
560 if(sum(one) > 0){
561 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
562 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
563 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
564 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
565 }
566 if(sum(two) > 0){
567 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
568 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
569 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
570 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
571 }
572 if(sum(three) > 0){
573 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
574 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
575 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
576 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
577 }
578 if(sum(one_two) > 0){
579 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
580 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
581 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
582 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
583 }
584 if(sum(one_three) > 0){
585 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
586 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
587 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
588 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
589 }
590 if(sum(two_three) > 0){
591 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
592 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
593 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
594 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
595 }
596 } else { #scatterplot data
597 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
598 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
599 if(sum(in_two) > 0){
600 scatterplot_locus_data[in_two,]$type = "In two"
601 }
602 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
603 if(sum(in_three)> 0){
604 scatterplot_locus_data[in_three,]$type = "In three"
605 }
606 not_in_one = scatterplot_locus_data$type != "In one"
607 if(sum(not_in_one) > 0){
608 scatterplot_locus_data[not_in_one,]$type = "In multiple"
609 }
610 p = NULL
611 if(nrow(scatterplot_locus_data) != 0){
612 if(on == "normalized_read_count"){
613 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
614 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
615 } else {
616 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
617 }
618 p = p + geom_point(aes(colour=type), position="jitter")
619 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
620 } else {
621 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
622 }
623 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
624 print(p)
625 dev.off()
626 }
627 if(sum(all) > 0){
628 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
629 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
630 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
631 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
632 }
633 }
634 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
635 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
636 colnames(patientResult)[6] = oneSample
637 colnames(patientResult)[7] = twoSample
638 colnames(patientResult)[8] = threeSample
639 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
640 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
641 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
642
643 colnamesBak = colnames(patientResult)
644 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
645 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
646 colnames(patientResult) = colnamesBak
647
648 patientResult$Locus = factor(patientResult$Locus, Titles)
649 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
650
651 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
652 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
653 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
654 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
655 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
656 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
657 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
658 print(plt)
659 dev.off()
660
661 fontSize = 4
662
663 bak = patientResult
664 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
665 patientResult$relativeValue = patientResult$value * 10
666 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
667 plt = ggplot(patientResult)
668 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
669 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
670 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
671 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
672 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
673 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
674 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
675 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
676 print(plt)
677 dev.off()
678 }
679
680 if(nrow(triplets) != 0){
681 triplets$uniqueID = "ID"
682
683 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
684 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
685 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
686
687 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
688 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
689 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
690
691 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
692
693 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
694 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
695 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
696
697 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
698 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
699
700 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
701
702 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
703
704 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
705
706 triplets = triplets[triplets$normalized_read_count >= min_cells,]
707
708 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
709
710 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
711
712 #remove duplicate V+J+CDR3, add together numerical values
713 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
714 Cell_Count=unique(.SD$Cell_Count),
715 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
716 Total_Read_Count=sum(.SD$Total_Read_Count),
717 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
718 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
719 Frequency=sum(.SD$Frequency),
720 normalized_read_count=sum(.SD$normalized_read_count),
721 Log10_Frequency=sum(.SD$Log10_Frequency),
722 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
723
724
725 interval = intervalReads
726 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
727 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
728
729 one = triplets[triplets$Sample == "14696_reg_BM",]
730 two = triplets[triplets$Sample == "24536_reg_BM",]
731 three = triplets[triplets$Sample == "24062_reg_BM",]
732 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T)
733
734 one = triplets[triplets$Sample == "16278_Left",]
735 two = triplets[triplets$Sample == "26402_Left",]
736 three = triplets[triplets$Sample == "26759_Left",]
737 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T)
738
739 one = triplets[triplets$Sample == "16278_Right",]
740 two = triplets[triplets$Sample == "26402_Right",]
741 three = triplets[triplets$Sample == "26759_Right",]
742 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T)
743
744
745 interval = intervalFreq
746 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
747 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
748
749 one = triplets[triplets$Sample == "14696_reg_BM",]
750 two = triplets[triplets$Sample == "24536_reg_BM",]
751 three = triplets[triplets$Sample == "24062_reg_BM",]
752 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F)
753
754 one = triplets[triplets$Sample == "16278_Left",]
755 two = triplets[triplets$Sample == "26402_Left",]
756 three = triplets[triplets$Sample == "26759_Left",]
757 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F)
758
759 one = triplets[triplets$Sample == "16278_Right",]
760 two = triplets[triplets$Sample == "26402_Right",]
761 three = triplets[triplets$Sample == "26759_Right",]
762 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F)
763 } else {
764 cat("", file="triplets.txt")
765 }