0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
22
|
57 patients = split(dat, dat$Patient, drop=T)
|
9
|
58 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
59 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
60 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
61 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
62 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
63 Titles = factor(Titles, levels=Titles)
|
|
64 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
65
|
29
|
66 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
67
|
51
|
68 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
69 patient.merge.list.second = list()
|
|
70
|
0
|
71 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
72 if (!is.data.frame(x) & is.list(x)){
|
|
73 x = x[[1]]
|
|
74 }
|
34
|
75 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
76 x = data.frame(x,stringsAsFactors=F)
|
0
|
77 onShort = "reads"
|
|
78 if(on == "Frequency"){
|
|
79 onShort = "freq"
|
|
80 }
|
18
|
81 onx = paste(on, ".x", sep="")
|
|
82 ony = paste(on, ".y", sep="")
|
0
|
83 splt = split(x, x$Sample, drop=T)
|
4
|
84 type="pair"
|
2
|
85 if(length(splt) == 1){
|
3
|
86 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
87 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
88 type="single"
|
2
|
89 }
|
0
|
90 patient1 = splt[[1]]
|
|
91 patient2 = splt[[2]]
|
|
92
|
|
93 threshholdIndex = which(colnames(product) == "interval")
|
|
94 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
95 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
96 titleIndex = which(colnames(product) == "Titles")
|
|
97 sampleIndex = which(colnames(x) == "Sample")
|
|
98 patientIndex = which(colnames(x) == "Patient")
|
|
99 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
100 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
101 patient = paste(x[1,patientIndex])
|
3
|
102
|
0
|
103 switched = F
|
|
104 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
105 tmp = twoSample
|
|
106 twoSample = oneSample
|
|
107 oneSample = tmp
|
|
108 tmp = patient1
|
|
109 patient1 = patient2
|
|
110 patient2 = tmp
|
|
111 switched = T
|
|
112 }
|
|
113 if(appendtxt){
|
4
|
114 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
115 }
|
51
|
116 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
9
|
117
|
29
|
118 if(mergeOn == "Clone_Sequence"){
|
|
119 patient1$merge = paste(patient1$Clone_Sequence)
|
|
120 patient2$merge = paste(patient2$Clone_Sequence)
|
|
121 } else {
|
|
122 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
123 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
124 }
|
9
|
125
|
30
|
126 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
127 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
128 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
129 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
130 scatterplot_data$on = onShort
|
|
131
|
49
|
132 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
133 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
134
|
|
135 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
136
|
51
|
137 start.time = proc.time()
|
|
138 merge.list = c()
|
|
139
|
|
140 if(patient %in% names(patient.merge.list)){
|
|
141 patientMerge = patient.merge.list[[patient]]
|
|
142 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
143 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
144
|
|
145 print(names(patient.merge.list))
|
|
146 } else {
|
|
147 #fuzzy matching here...
|
49
|
148 #merge.list = patientMerge$merge
|
51
|
149
|
49
|
150 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
151 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
152
|
|
153 patient1.fuzzy = patient1
|
|
154 patient2.fuzzy = patient2
|
|
155
|
36
|
156 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
157 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
158
|
48
|
159 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
161
|
48
|
162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
51
|
164
|
49
|
165 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
166 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
51
|
167
|
49
|
168 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
169 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
51
|
170
|
37
|
171 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
172 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
49
|
173
|
|
174 merge.list = list()
|
|
175
|
|
176 merge.list[["second"]] = vector()
|
|
177
|
|
178
|
37
|
179 while(nrow(patient.fuzzy) > 1){
|
|
180 first.merge = patient.fuzzy[1,"merge"]
|
|
181 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
49
|
182 first.sample = patient.fuzzy[1,"Sample"]
|
37
|
183 merge.filter = first.merge == patient.fuzzy$merge
|
51
|
184
|
49
|
185 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
51
|
186
|
49
|
187 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
188 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
189
|
|
190 #first match same sample, sum to a single row, same for other sample
|
|
191 #then merge rows like 'normal'
|
51
|
192
|
48
|
193 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
49
|
194
|
|
195
|
|
196
|
44
|
197 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
49
|
198 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
199 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
200
|
|
201 first.rows = patient.fuzzy[first.match.filter,]
|
|
202 second.rows = patient.fuzzy[second.match.filter,]
|
|
203
|
50
|
204 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
205 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
206 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
207 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
208
|
49
|
209 first.sum = data.frame(merge = first.clone.sequence,
|
|
210 Patient = patient,
|
|
211 Receptor = first.rows[1,"Receptor"],
|
|
212 Sample = first.rows[1,"Sample"],
|
|
213 Cell_Count = first.rows[1,"Cell_Count"],
|
|
214 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
215 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
216 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
217 dsPerM = sum(first.rows$dsPerM),
|
50
|
218 J_Segment_Major_Gene = first.rows.j,
|
|
219 V_Segment_Major_Gene = first.rows.v,
|
49
|
220 Clone_Sequence = first.clone.sequence,
|
|
221 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
222 Related_to_leukemia_clone = F,
|
|
223 Frequency = sum(first.rows$Frequency),
|
|
224 locus_V = first.rows[1,"locus_V"],
|
|
225 locus_J = first.rows[1,"locus_J"],
|
|
226 min_cell_count = first.rows[1,"min_cell_count"],
|
|
227 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
228 paste = first.rows[1,"paste"],
|
|
229 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
230
|
|
231 if(nrow(second.rows) > 0){
|
50
|
232 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
233 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
234 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
235 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
236
|
49
|
237 second.sum = data.frame(merge = first.clone.sequence,
|
|
238 Patient = patient,
|
|
239 Receptor = second.rows[1,"Receptor"],
|
|
240 Sample = second.rows[1,"Sample"],
|
|
241 Cell_Count = second.rows[1,"Cell_Count"],
|
|
242 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
243 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
244 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
245 dsPerM = sum(second.rows$dsPerM),
|
50
|
246 J_Segment_Major_Gene = second.rows.j,
|
|
247 V_Segment_Major_Gene = second.rows.v,
|
49
|
248 Clone_Sequence = first.clone.sequence,
|
|
249 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
250 Related_to_leukemia_clone = F,
|
|
251 Frequency = sum(second.rows$Frequency),
|
|
252 locus_V = second.rows[1,"locus_V"],
|
|
253 locus_J = second.rows[1,"locus_J"],
|
|
254 min_cell_count = second.rows[1,"min_cell_count"],
|
|
255 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
256 paste = second.rows[1,"paste"],
|
|
257 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
258
|
|
259 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
260 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
261
|
50
|
262 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
263 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
49
|
264
|
|
265 if(sum(first.match.filter) == 1 & sum(second.match.filter) == 1){
|
|
266 second.clone.sequence = patient.fuzzy[second.match.filter, "Clone_Sequence"]
|
50
|
267 if(first.clone.sequence != second.clone.sequence){
|
|
268 #merge.list[["second"]] = append(merge.list[["second"]], second.clone.sequence)
|
49
|
269 }
|
|
270 }
|
|
271
|
|
272 if(nrow(first.rows) > 1 | nrow(second.rows) > 1){
|
|
273
|
|
274 }
|
|
275
|
37
|
276 } else {
|
|
277 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
278 }
|
|
279 }
|
51
|
280 patient.merge.list[[patient]] <<- patientMerge
|
|
281 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
282 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
34
|
283 }
|
51
|
284
|
52
|
285 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
286 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
51
|
287
|
37
|
288
|
18
|
289 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
290 res1 = vector()
|
|
291 res2 = vector()
|
|
292 resBoth = vector()
|
|
293 read1Count = vector()
|
|
294 read2Count = vector()
|
2
|
295 locussum1 = vector()
|
|
296 locussum2 = vector()
|
9
|
297
|
0
|
298 #for(iter in 1){
|
|
299 for(iter in 1:length(product[,1])){
|
|
300 threshhold = product[iter,threshholdIndex]
|
|
301 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
302 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
303 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
304 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
305 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
306 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
307 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
308 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
309 res1 = append(res1, sum(one))
|
2
|
310 res2 = append(res2, sum(two))
|
0
|
311 resBoth = append(resBoth, sum(both))
|
2
|
312 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
313 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
314 #threshhold = 0
|
|
315 if(threshhold != 0){
|
|
316 if(sum(one) > 0){
|
15
|
317 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
318 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
319 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
320 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
321 }
|
|
322 if(sum(two) > 0){
|
15
|
323 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
324 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
325 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
326 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
327 }
|
29
|
328 } else {
|
|
329 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
49
|
330 #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),]
|
|
331 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
29
|
332 if(nrow(scatterplot_locus_data) > 0){
|
|
333 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
334 }
|
34
|
335 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
336 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
337 if(any(in_two)){
|
49
|
338 scatterplot_locus_data[in_two,]$type = twoSample
|
34
|
339 }
|
49
|
340 in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
|
|
341 #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
|
|
342 #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
|
30
|
343 if(any(in_both)){
|
|
344 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
345 }
|
29
|
346 if(type == "single"){
|
|
347 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
348 }
|
|
349 p = NULL
|
|
350 if(nrow(scatterplot_locus_data) != 0){
|
|
351 if(on == "normalized_read_count"){
|
30
|
352 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
49
|
353 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
354 } else {
|
49
|
355 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
356 }
|
|
357 p = p + geom_point(aes(colour=type), position="jitter")
|
|
358 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
359 } else {
|
|
360 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
361 }
|
|
362 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
363 print(p)
|
|
364 dev.off()
|
0
|
365 }
|
|
366 if(sum(both) > 0){
|
15
|
367 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
368 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
369 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
370 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
371 }
|
0
|
372 }
|
2
|
373 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
374 if(sum(is.na(patientResult$percentage)) > 0){
|
|
375 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
376 }
|
|
377 colnames(patientResult)[6] = oneSample
|
|
378 colnames(patientResult)[8] = twoSample
|
|
379 colnamesBak = colnames(patientResult)
|
2
|
380 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
381 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
382 colnames(patientResult) = colnamesBak
|
|
383
|
|
384 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
385 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
386
|
|
387 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
388 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
389 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
390 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
391 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
392 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
393 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
394 print(plt)
|
|
395 dev.off()
|
|
396 #(t,r,b,l)
|
|
397 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
398 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
399 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
400 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
401 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
402 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
403 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
404 print(plt)
|
|
405 dev.off()
|
|
406
|
|
407 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
408 patientResult$relativeValue = patientResult$value * 10
|
|
409 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
410 plt = ggplot(patientResult)
|
|
411 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
412 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
413 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
414 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
415 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
416 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
417 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
418 print(plt)
|
|
419 dev.off()
|
|
420 }
|
|
421
|
3
|
422 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
423
|
0
|
424 interval = intervalFreq
|
|
425 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
426 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
427 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
428
|
3
|
429 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
430
|
0
|
431 interval = intervalReads
|
|
432 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
433 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
434 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
435
|
3
|
436 cat("</table></html>", file=logfile, append=T)
|
|
437
|
33
|
438
|
|
439 if(nrow(single_patients) > 0){
|
|
440 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
441 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
442 p = p + geom_point(aes(colour=type), position="jitter")
|
|
443 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
444 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
445 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
446 print(p)
|
|
447 dev.off()
|
7
|
448
|
33
|
449 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
450 p = p + geom_point(aes(colour=type), position="jitter")
|
|
451 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
452 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
453 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
454 print(p)
|
|
455 dev.off()
|
|
456 } else {
|
|
457 empty <- data.frame()
|
|
458 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
459
|
|
460 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
461 print(p)
|
|
462 dev.off()
|
|
463
|
|
464 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
465 print(p)
|
|
466 dev.off()
|
|
467 }
|
7
|
468 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
469 onShort = "reads"
|
|
470 if(on == "Frequency"){
|
|
471 onShort = "freq"
|
|
472 }
|
18
|
473 onx = paste(on, ".x", sep="")
|
|
474 ony = paste(on, ".y", sep="")
|
|
475 onz = paste(on, ".z", sep="")
|
7
|
476 type="triplet"
|
|
477
|
|
478 threshholdIndex = which(colnames(product) == "interval")
|
|
479 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
480 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
481 titleIndex = which(colnames(product) == "Titles")
|
|
482 sampleIndex = which(colnames(patient1) == "Sample")
|
|
483 patientIndex = which(colnames(patient1) == "Patient")
|
|
484 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
485 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
486 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
487
|
29
|
488 if(mergeOn == "Clone_Sequence"){
|
|
489 patient1$merge = paste(patient1$Clone_Sequence)
|
|
490 patient2$merge = paste(patient2$Clone_Sequence)
|
|
491 patient3$merge = paste(patient3$Clone_Sequence)
|
|
492
|
|
493 } else {
|
|
494 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
495 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
496 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
497 }
|
9
|
498
|
|
499 patientMerge = merge(patient1, patient2, by="merge")
|
|
500 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
501 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
502 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
503 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
504 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
505 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
506 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
507 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
508 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
509
|
28
|
510
|
30
|
511 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
512 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
513 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
514 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
515
|
7
|
516 res1 = vector()
|
|
517 res2 = vector()
|
|
518 res3 = vector()
|
20
|
519 res12 = vector()
|
|
520 res13 = vector()
|
|
521 res23 = vector()
|
7
|
522 resAll = vector()
|
|
523 read1Count = vector()
|
|
524 read2Count = vector()
|
|
525 read3Count = vector()
|
|
526
|
|
527 if(appendTriplets){
|
9
|
528 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
529 }
|
|
530 for(iter in 1:length(product[,1])){
|
|
531 threshhold = product[iter,threshholdIndex]
|
|
532 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
533 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
534 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
535 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
536
|
30
|
537 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
538 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
539 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
540
|
30
|
541 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
542 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
543 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
544
|
18
|
545 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
546 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
547 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
548 res1 = append(res1, sum(one))
|
|
549 res2 = append(res2, sum(two))
|
|
550 res3 = append(res3, sum(three))
|
|
551 resAll = append(resAll, sum(all))
|
20
|
552 res12 = append(res12, sum(one_two))
|
|
553 res13 = append(res13, sum(one_three))
|
|
554 res23 = append(res23, sum(two_three))
|
7
|
555 #threshhold = 0
|
|
556 if(threshhold != 0){
|
|
557 if(sum(one) > 0){
|
20
|
558 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
559 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
560 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
561 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
562 }
|
|
563 if(sum(two) > 0){
|
20
|
564 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
565 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
566 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
567 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
568 }
|
|
569 if(sum(three) > 0){
|
20
|
570 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
571 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
572 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
573 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
574 }
|
20
|
575 if(sum(one_two) > 0){
|
|
576 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
577 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
578 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
579 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
580 }
|
|
581 if(sum(one_three) > 0){
|
|
582 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
583 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
584 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
585 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
586 }
|
|
587 if(sum(two_three) > 0){
|
|
588 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
589 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
590 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
591 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
592 }
|
|
593 } else { #scatterplot data
|
24
|
594 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
595 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
596 if(sum(in_two) > 0){
|
|
597 scatterplot_locus_data[in_two,]$type = "In two"
|
|
598 }
|
30
|
599 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
600 if(sum(in_three)> 0){
|
|
601 scatterplot_locus_data[in_three,]$type = "In three"
|
|
602 }
|
|
603 not_in_one = scatterplot_locus_data$type != "In one"
|
|
604 if(sum(not_in_one) > 0){
|
|
605 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
606 }
|
20
|
607 p = NULL
|
|
608 if(nrow(scatterplot_locus_data) != 0){
|
|
609 if(on == "normalized_read_count"){
|
31
|
610 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
611 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
612 } else {
|
32
|
613 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
614 }
|
|
615 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
616 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
617 } else {
|
|
618 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
619 }
|
|
620 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
621 print(p)
|
|
622 dev.off()
|
|
623 }
|
7
|
624 if(sum(all) > 0){
|
20
|
625 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
626 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
627 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
628 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
629 }
|
|
630 }
|
20
|
631 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
632 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
633 colnames(patientResult)[6] = oneSample
|
20
|
634 colnames(patientResult)[7] = twoSample
|
|
635 colnames(patientResult)[8] = threeSample
|
|
636 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
637 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
638 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
639
|
|
640 colnamesBak = colnames(patientResult)
|
20
|
641 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
642 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
643 colnames(patientResult) = colnamesBak
|
|
644
|
|
645 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
646 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
647
|
|
648 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
649 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
650 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
651 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
652 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
653 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
654 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
655 print(plt)
|
|
656 dev.off()
|
|
657
|
|
658 fontSize = 4
|
|
659
|
|
660 bak = patientResult
|
|
661 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
662 patientResult$relativeValue = patientResult$value * 10
|
|
663 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
664 plt = ggplot(patientResult)
|
|
665 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
666 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
667 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
668 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
669 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
670 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
671 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
672 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
673 print(plt)
|
|
674 dev.off()
|
|
675 }
|
|
676
|
33
|
677 if(nrow(triplets) != 0){
|
|
678 triplets$uniqueID = "ID"
|
|
679
|
|
680 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
681 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
682 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
683
|
|
684 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
685 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
686 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
687
|
|
688 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
689
|
|
690 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
691 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
692 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
693
|
|
694 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
695 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
696
|
|
697 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
698
|
|
699 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
700
|
|
701 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
702
|
|
703 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
704
|
|
705 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
706
|
|
707 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
708
|
|
709 #remove duplicate V+J+CDR3, add together numerical values
|
|
710 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
711 Cell_Count=unique(.SD$Cell_Count),
|
|
712 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
713 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
714 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
715 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
716 Frequency=sum(.SD$Frequency),
|
|
717 normalized_read_count=sum(.SD$normalized_read_count),
|
|
718 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
719 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
720
|
|
721
|
|
722 interval = intervalReads
|
|
723 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
724 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
725
|
|
726 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
727 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
728 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
729 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
730
|
|
731 one = triplets[triplets$Sample == "16278_Left",]
|
|
732 two = triplets[triplets$Sample == "26402_Left",]
|
|
733 three = triplets[triplets$Sample == "26759_Left",]
|
|
734 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
735
|
|
736 one = triplets[triplets$Sample == "16278_Right",]
|
|
737 two = triplets[triplets$Sample == "26402_Right",]
|
|
738 three = triplets[triplets$Sample == "26759_Right",]
|
|
739 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
740
|
|
741
|
|
742 interval = intervalFreq
|
|
743 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
744 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
745
|
|
746 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
747 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
748 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
749 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
750
|
|
751 one = triplets[triplets$Sample == "16278_Left",]
|
|
752 two = triplets[triplets$Sample == "26402_Left",]
|
|
753 three = triplets[triplets$Sample == "26759_Left",]
|
|
754 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
755
|
|
756 one = triplets[triplets$Sample == "16278_Right",]
|
|
757 two = triplets[triplets$Sample == "26402_Right",]
|
|
758 three = triplets[triplets$Sample == "26759_Right",]
|
|
759 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
760 } else {
|
|
761 cat("", file="triplets.txt")
|
|
762 }
|