0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
68
|
2 options(scipen=999)
|
0
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
3
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
29
|
9 mergeOn = args[6]
|
3
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
12
|
4
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
0
|
18 #require(xtable)
|
3
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
66
|
21 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
34
|
22 dat$dsPerM = 0
|
9
|
23 dat = dat[!is.na(dat$Patient),]
|
13
|
24 dat$Related_to_leukemia_clone = F
|
9
|
25
|
0
|
26 setwd(outDir)
|
3
|
27 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
28 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
30
|
3
|
31 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
32
|
13
|
33 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
34
|
3
|
35 dat = dat[dat$Frequency >= min_freq,]
|
|
36
|
13
|
37 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
38
|
|
39 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
40
|
|
41 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
42 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
43 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
44
|
|
45 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
46 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
47
|
|
48 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
49 print(paste("rows:", nrow(dat)))
|
13
|
50 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
51 print(paste("rows:", nrow(dat)))
|
13
|
52 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
53
|
3
|
54 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
55
|
|
56 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
57
|
22
|
58 patients = split(dat, dat$Patient, drop=T)
|
9
|
59 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
60 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
61 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
62 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
63 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
64 Titles = factor(Titles, levels=Titles)
|
|
65 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
66
|
29
|
67 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
68
|
51
|
69 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
70 patient.merge.list.second = list()
|
68
|
71 scatter_locus_data_list = list()
|
56
|
72 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
73 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
0
|
74 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
75 if (!is.data.frame(x) & is.list(x)){
|
|
76 x = x[[1]]
|
|
77 }
|
34
|
78 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
79 x = data.frame(x,stringsAsFactors=F)
|
0
|
80 onShort = "reads"
|
|
81 if(on == "Frequency"){
|
|
82 onShort = "freq"
|
|
83 }
|
18
|
84 onx = paste(on, ".x", sep="")
|
|
85 ony = paste(on, ".y", sep="")
|
0
|
86 splt = split(x, x$Sample, drop=T)
|
4
|
87 type="pair"
|
2
|
88 if(length(splt) == 1){
|
3
|
89 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
90 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
91 type="single"
|
2
|
92 }
|
0
|
93 patient1 = splt[[1]]
|
|
94 patient2 = splt[[2]]
|
|
95
|
|
96 threshholdIndex = which(colnames(product) == "interval")
|
|
97 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
98 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
99 titleIndex = which(colnames(product) == "Titles")
|
|
100 sampleIndex = which(colnames(x) == "Sample")
|
|
101 patientIndex = which(colnames(x) == "Patient")
|
|
102 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
103 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
104 patient = paste(x[1,patientIndex])
|
3
|
105
|
0
|
106 switched = F
|
|
107 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
108 tmp = twoSample
|
|
109 twoSample = oneSample
|
|
110 oneSample = tmp
|
|
111 tmp = patient1
|
|
112 patient1 = patient2
|
|
113 patient2 = tmp
|
|
114 switched = T
|
|
115 }
|
|
116 if(appendtxt){
|
4
|
117 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
118 }
|
51
|
119 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
9
|
120
|
29
|
121 if(mergeOn == "Clone_Sequence"){
|
|
122 patient1$merge = paste(patient1$Clone_Sequence)
|
|
123 patient2$merge = paste(patient2$Clone_Sequence)
|
|
124 } else {
|
|
125 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
126 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
127 }
|
9
|
128
|
30
|
129 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
68
|
130 #scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
|
131 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
132 #scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
133 #scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
|
134 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
135 #scatterplot_data$type = factor(x=NULL, levels=scatterplot.data.type.factor)
|
|
136 scatterplot_data$type = character(0)
|
|
137 scatterplot_data$link = numeric(0)
|
|
138 scatterplot_data$on = character(0)
|
29
|
139
|
49
|
140 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
141 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
142
|
|
143 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
144
|
51
|
145 start.time = proc.time()
|
|
146 merge.list = c()
|
|
147
|
|
148 if(patient %in% names(patient.merge.list)){
|
|
149 patientMerge = patient.merge.list[[patient]]
|
|
150 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
68
|
151 scatterplot_data = scatter_locus_data_list[[patient]]
|
51
|
152 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
153
|
|
154 print(names(patient.merge.list))
|
|
155 } else {
|
|
156 #fuzzy matching here...
|
49
|
157 #merge.list = patientMerge$merge
|
51
|
158
|
49
|
159 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
160 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
161
|
|
162 patient1.fuzzy = patient1
|
|
163 patient2.fuzzy = patient2
|
|
164
|
36
|
165 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
166 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
167
|
48
|
168 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
169 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
170
|
48
|
171 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
172 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
51
|
173
|
49
|
174 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
175 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
51
|
176
|
49
|
177 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
178 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
51
|
179
|
37
|
180 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
181 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
49
|
182
|
|
183 merge.list = list()
|
|
184
|
|
185 merge.list[["second"]] = vector()
|
68
|
186
|
|
187 link.count = 1
|
|
188
|
37
|
189 while(nrow(patient.fuzzy) > 1){
|
|
190 first.merge = patient.fuzzy[1,"merge"]
|
|
191 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
49
|
192 first.sample = patient.fuzzy[1,"Sample"]
|
37
|
193 merge.filter = first.merge == patient.fuzzy$merge
|
51
|
194
|
49
|
195 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
51
|
196
|
49
|
197 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
198 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
199
|
|
200 #first match same sample, sum to a single row, same for other sample
|
|
201 #then merge rows like 'normal'
|
51
|
202
|
48
|
203 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
49
|
204
|
|
205
|
|
206
|
44
|
207 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
49
|
208 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
209 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
210
|
|
211 first.rows = patient.fuzzy[first.match.filter,]
|
|
212 second.rows = patient.fuzzy[second.match.filter,]
|
|
213
|
50
|
214 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
215 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
216 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
217 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
218
|
49
|
219 first.sum = data.frame(merge = first.clone.sequence,
|
|
220 Patient = patient,
|
|
221 Receptor = first.rows[1,"Receptor"],
|
|
222 Sample = first.rows[1,"Sample"],
|
|
223 Cell_Count = first.rows[1,"Cell_Count"],
|
|
224 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
225 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
226 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
227 dsPerM = sum(first.rows$dsPerM),
|
50
|
228 J_Segment_Major_Gene = first.rows.j,
|
|
229 V_Segment_Major_Gene = first.rows.v,
|
49
|
230 Clone_Sequence = first.clone.sequence,
|
|
231 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
232 Related_to_leukemia_clone = F,
|
|
233 Frequency = sum(first.rows$Frequency),
|
|
234 locus_V = first.rows[1,"locus_V"],
|
|
235 locus_J = first.rows[1,"locus_J"],
|
|
236 min_cell_count = first.rows[1,"min_cell_count"],
|
|
237 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
238 paste = first.rows[1,"paste"],
|
|
239 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
240
|
|
241 if(nrow(second.rows) > 0){
|
50
|
242 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
243 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
244 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
245 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
246
|
55
|
247 second.sum = data.frame(merge = first.clone.sequence,
|
49
|
248 Patient = patient,
|
|
249 Receptor = second.rows[1,"Receptor"],
|
|
250 Sample = second.rows[1,"Sample"],
|
|
251 Cell_Count = second.rows[1,"Cell_Count"],
|
|
252 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
253 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
254 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
255 dsPerM = sum(second.rows$dsPerM),
|
50
|
256 J_Segment_Major_Gene = second.rows.j,
|
|
257 V_Segment_Major_Gene = second.rows.v,
|
49
|
258 Clone_Sequence = first.clone.sequence,
|
|
259 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
260 Related_to_leukemia_clone = F,
|
|
261 Frequency = sum(second.rows$Frequency),
|
|
262 locus_V = second.rows[1,"locus_V"],
|
|
263 locus_J = second.rows[1,"locus_J"],
|
|
264 min_cell_count = second.rows[1,"min_cell_count"],
|
|
265 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
266 paste = second.rows[1,"paste"],
|
|
267 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
268
|
|
269 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
270 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
271
|
50
|
272 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
273 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
49
|
274
|
56
|
275 tmp.rows = rbind(first.rows, second.rows)
|
|
276 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
68
|
277
|
|
278
|
|
279 #add to the scatterplot data
|
|
280 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
281 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
282 scatterplot.row$link = link.count
|
|
283 scatterplot.row$on = onShort
|
|
284
|
|
285 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
286
|
|
287 scatterplot.row = second.sum[,scatterplot_data_columns]
|
|
288 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
289 scatterplot.row$link = link.count
|
|
290 scatterplot.row$on = onShort
|
|
291
|
|
292 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
293
|
|
294 #write some information about the match to a log file
|
56
|
295 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
296 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
297 } else {
|
|
298 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
299 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
300 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
301 } else {
|
58
|
302 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
49
|
303 }
|
|
304 }
|
|
305
|
59
|
306 } else if(nrow(first.rows) > 1) {
|
|
307 if(patient1[1,"Sample"] == first.sample){
|
|
308 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
309 patient1 = rbind(patient1, first.sum)
|
|
310 } else {
|
|
311 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
312 patient2 = rbind(patient2, first.sum)
|
|
313 }
|
|
314
|
|
315 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
316 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
317
|
|
318 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
68
|
319
|
|
320 #add to the scatterplot data
|
|
321 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
322 scatterplot.row$type = first.sum[,"Sample"]
|
|
323 scatterplot.row$link = link.count
|
|
324 scatterplot.row$on = onShort
|
|
325
|
|
326 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
59
|
327
|
|
328 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
37
|
329 } else {
|
|
330 patient.fuzzy = patient.fuzzy[-1,]
|
68
|
331
|
|
332 #add to the scatterplot data
|
|
333 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
334 scatterplot.row$type = first.sum[,"Sample"]
|
|
335 scatterplot.row$link = link.count
|
|
336 scatterplot.row$on = onShort
|
|
337
|
|
338 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
34
|
339 }
|
68
|
340 link.count = link.count + 1
|
34
|
341 }
|
51
|
342 patient.merge.list[[patient]] <<- patientMerge
|
|
343 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
68
|
344 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
51
|
345 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
34
|
346 }
|
51
|
347
|
52
|
348 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
349 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
51
|
350
|
37
|
351
|
18
|
352 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
68
|
353 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
0
|
354 res1 = vector()
|
|
355 res2 = vector()
|
|
356 resBoth = vector()
|
|
357 read1Count = vector()
|
|
358 read2Count = vector()
|
2
|
359 locussum1 = vector()
|
|
360 locussum2 = vector()
|
9
|
361
|
0
|
362 #for(iter in 1){
|
|
363 for(iter in 1:length(product[,1])){
|
|
364 threshhold = product[iter,threshholdIndex]
|
|
365 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
366 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
367 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
368 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
369 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
370 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
371 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
372 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
373 res1 = append(res1, sum(one))
|
2
|
374 res2 = append(res2, sum(two))
|
0
|
375 resBoth = append(resBoth, sum(both))
|
2
|
376 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
377 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
378 #threshhold = 0
|
|
379 if(threshhold != 0){
|
|
380 if(sum(one) > 0){
|
15
|
381 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
382 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
383 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
384 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
385 }
|
|
386 if(sum(two) > 0){
|
15
|
387 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
388 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
389 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
390 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
391 }
|
29
|
392 } else {
|
|
393 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
49
|
394 #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),]
|
68
|
395 #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
29
|
396 if(nrow(scatterplot_locus_data) > 0){
|
|
397 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
398 }
|
68
|
399
|
|
400
|
|
401
|
|
402 #in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
403 #in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
404 #if(any(in_two)){
|
|
405 # scatterplot_locus_data[in_two,]$type = twoSample
|
|
406 #}
|
|
407 #in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
|
|
408 ##merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
|
|
409 ##exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
|
|
410 #if(any(in_both)){
|
|
411 # scatterplot_locus_data[in_both,]$type = "In Both"
|
|
412 #}
|
|
413 #if(type == "single" & (nrow(scatterplot_locus_data) > 0 | !any(scatterplot_locus_data$Patient %in% single_patients$Patient))){
|
|
414 # single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
415 #}
|
|
416
|
|
417
|
29
|
418 p = NULL
|
68
|
419 print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
29
|
420 if(nrow(scatterplot_locus_data) != 0){
|
|
421 if(on == "normalized_read_count"){
|
30
|
422 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
68
|
423 #p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
424 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,10^6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
425 } else {
|
68
|
426 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
427 p = ggplot(scatterplot_locus_data, aes(type, Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
428 }
|
68
|
429 #p = p + geom_point(aes(colour=type), position="jitter")
|
|
430 p = p + geom_point(aes(colour=type), position="dodge")
|
29
|
431 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
432 } else {
|
|
433 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
434 }
|
|
435 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
436 print(p)
|
|
437 dev.off()
|
0
|
438 }
|
|
439 if(sum(both) > 0){
|
15
|
440 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
441 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
442 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
443 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
444 }
|
0
|
445 }
|
2
|
446 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
447 if(sum(is.na(patientResult$percentage)) > 0){
|
|
448 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
449 }
|
|
450 colnames(patientResult)[6] = oneSample
|
|
451 colnames(patientResult)[8] = twoSample
|
|
452 colnamesBak = colnames(patientResult)
|
2
|
453 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
454 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
455 colnames(patientResult) = colnamesBak
|
|
456
|
|
457 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
458 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
459
|
|
460 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
461 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
462 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
463 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
464 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
465 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
466 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
467 print(plt)
|
|
468 dev.off()
|
|
469 #(t,r,b,l)
|
|
470 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
471 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
472 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
473 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
474 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
475 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
476 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
477 print(plt)
|
|
478 dev.off()
|
|
479
|
|
480 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
481 patientResult$relativeValue = patientResult$value * 10
|
|
482 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
483 plt = ggplot(patientResult)
|
|
484 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
485 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
486 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
487 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
488 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
489 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
490 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
491 print(plt)
|
|
492 dev.off()
|
|
493 }
|
|
494
|
3
|
495 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
496
|
0
|
497 interval = intervalFreq
|
|
498 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
499 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
500 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
501
|
3
|
502 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
503
|
0
|
504 interval = intervalReads
|
|
505 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
506 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
507 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
33
|
508
|
|
509 if(nrow(single_patients) > 0){
|
|
510 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
68
|
511 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
33
|
512 p = p + geom_point(aes(colour=type), position="jitter")
|
|
513 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
514 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
515 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
516 print(p)
|
|
517 dev.off()
|
7
|
518
|
68
|
519 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
520 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
33
|
521 p = p + geom_point(aes(colour=type), position="jitter")
|
|
522 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
523 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
524 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
525 print(p)
|
|
526 dev.off()
|
|
527 } else {
|
|
528 empty <- data.frame()
|
|
529 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
530
|
|
531 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
532 print(p)
|
|
533 dev.off()
|
|
534
|
|
535 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
536 print(p)
|
|
537 dev.off()
|
|
538 }
|
60
|
539
|
|
540 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
541 patient.merge.list.second = list()
|
|
542
|
7
|
543 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
544 onShort = "reads"
|
|
545 if(on == "Frequency"){
|
|
546 onShort = "freq"
|
|
547 }
|
18
|
548 onx = paste(on, ".x", sep="")
|
|
549 ony = paste(on, ".y", sep="")
|
|
550 onz = paste(on, ".z", sep="")
|
7
|
551 type="triplet"
|
|
552
|
|
553 threshholdIndex = which(colnames(product) == "interval")
|
|
554 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
555 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
556 titleIndex = which(colnames(product) == "Titles")
|
|
557 sampleIndex = which(colnames(patient1) == "Sample")
|
|
558 patientIndex = which(colnames(patient1) == "Patient")
|
|
559 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
560 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
561 threeSample = paste(patient3[1,sampleIndex], sep="")
|
60
|
562
|
29
|
563 if(mergeOn == "Clone_Sequence"){
|
|
564 patient1$merge = paste(patient1$Clone_Sequence)
|
|
565 patient2$merge = paste(patient2$Clone_Sequence)
|
|
566 patient3$merge = paste(patient3$Clone_Sequence)
|
|
567
|
|
568 } else {
|
|
569 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
570 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
571 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
572 }
|
60
|
573
|
|
574 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
575 patient1.fuzzy = patient1
|
|
576 patient2.fuzzy = patient2
|
|
577 patient3.fuzzy = patient3
|
|
578
|
|
579 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
|
580
|
|
581 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
582 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
583 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
|
584
|
|
585 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
65
|
586 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
60
|
587
|
|
588 other.sample.list = list()
|
|
589 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
590 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
591 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
|
592
|
9
|
593 patientMerge = merge(patient1, patient2, by="merge")
|
|
594 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
595 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
60
|
596 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
597 patientMerge = patientMerge[NULL,]
|
|
598
|
|
599 duo.merge.list = list()
|
|
600
|
20
|
601 patientMerge12 = merge(patient1, patient2, by="merge")
|
60
|
602 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
603 patientMerge12 = patientMerge12[NULL,]
|
|
604 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
605 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
|
606
|
20
|
607 patientMerge13 = merge(patient1, patient3, by="merge")
|
60
|
608 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
609 patientMerge13 = patientMerge13[NULL,]
|
|
610 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
611 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
|
612
|
20
|
613 patientMerge23 = merge(patient2, patient3, by="merge")
|
60
|
614 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
615 patientMerge23 = patientMerge23[NULL,]
|
|
616 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
617 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
|
618
|
|
619 merge.list = list()
|
|
620 merge.list[["second"]] = vector()
|
|
621
|
|
622 start.time = proc.time()
|
|
623 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
624 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
625 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
626 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
627 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
|
628
|
|
629 merge.list[["second"]] = patient.merge.list.second[[label1]]
|
|
630
|
|
631 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
632 } else {
|
|
633 while(nrow(patient.fuzzy) > 0){
|
|
634 first.merge = patient.fuzzy[1,"merge"]
|
|
635 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
636 first.sample = patient.fuzzy[1,"Sample"]
|
|
637
|
|
638 merge.filter = first.merge == patient.fuzzy$merge
|
|
639
|
|
640 second.sample = other.sample.list[[first.sample]][1]
|
|
641 third.sample = other.sample.list[[first.sample]][2]
|
|
642
|
|
643 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
644 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
645 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
|
646
|
|
647 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
|
648
|
|
649 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
650 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
651 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
|
652
|
|
653 matches.in.1 = any(match.filter.1)
|
|
654 matches.in.2 = any(match.filter.2)
|
|
655 matches.in.3 = any(match.filter.3)
|
|
656
|
|
657
|
|
658
|
|
659 rows.1 = patient.fuzzy[match.filter.1,]
|
|
660
|
|
661 sum.1 = data.frame(merge = first.clone.sequence,
|
|
662 Patient = label1,
|
|
663 Receptor = rows.1[1,"Receptor"],
|
|
664 Sample = rows.1[1,"Sample"],
|
|
665 Cell_Count = rows.1[1,"Cell_Count"],
|
|
666 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
667 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
668 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
669 dsPerM = sum(rows.1$dsPerM),
|
|
670 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
671 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
672 Clone_Sequence = first.clone.sequence,
|
|
673 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
674 Related_to_leukemia_clone = F,
|
|
675 Frequency = sum(rows.1$Frequency),
|
|
676 locus_V = rows.1[1,"locus_V"],
|
|
677 locus_J = rows.1[1,"locus_J"],
|
64
|
678 uniqueID = rows.1[1,"uniqueID"],
|
60
|
679 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
680 sum.2 = sum.1[NULL,]
|
|
681 rows.2 = patient.fuzzy[match.filter.2,]
|
|
682 if(matches.in.2){
|
|
683 sum.2 = data.frame(merge = first.clone.sequence,
|
|
684 Patient = label1,
|
|
685 Receptor = rows.2[1,"Receptor"],
|
|
686 Sample = rows.2[1,"Sample"],
|
|
687 Cell_Count = rows.2[1,"Cell_Count"],
|
|
688 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
689 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
690 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
691 dsPerM = sum(rows.2$dsPerM),
|
|
692 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
693 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
694 Clone_Sequence = first.clone.sequence,
|
|
695 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
696 Related_to_leukemia_clone = F,
|
|
697 Frequency = sum(rows.2$Frequency),
|
|
698 locus_V = rows.2[1,"locus_V"],
|
|
699 locus_J = rows.2[1,"locus_J"],
|
64
|
700 uniqueID = rows.2[1,"uniqueID"],
|
60
|
701 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
702 }
|
|
703
|
|
704 sum.3 = sum.1[NULL,]
|
|
705 rows.3 = patient.fuzzy[match.filter.3,]
|
|
706 if(matches.in.3){
|
|
707 sum.3 = data.frame(merge = first.clone.sequence,
|
|
708 Patient = label1,
|
|
709 Receptor = rows.3[1,"Receptor"],
|
|
710 Sample = rows.3[1,"Sample"],
|
|
711 Cell_Count = rows.3[1,"Cell_Count"],
|
|
712 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
713 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
714 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
715 dsPerM = sum(rows.3$dsPerM),
|
|
716 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
717 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
718 Clone_Sequence = first.clone.sequence,
|
|
719 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
720 Related_to_leukemia_clone = F,
|
|
721 Frequency = sum(rows.3$Frequency),
|
|
722 locus_V = rows.3[1,"locus_V"],
|
|
723 locus_J = rows.3[1,"locus_J"],
|
64
|
724 uniqueID = rows.3[1,"uniqueID"],
|
60
|
725 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
726 }
|
|
727
|
|
728 if(matches.in.2 & matches.in.3){
|
|
729 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
730 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
731 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
732 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
|
733
|
|
734 patientMerge = rbind(patientMerge, merge.123)
|
|
735 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
|
736
|
|
737 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
738 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
739
|
|
740 } else if (matches.in.2) {
|
|
741 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
742 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
743
|
|
744 second.sample = sum.2[,"Sample"]
|
|
745
|
|
746 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
747
|
|
748 merge.12 = merge(sum.1, sum.2, by="merge")
|
|
749
|
|
750 current.merge.list = rbind(current.merge.list, merge.12)
|
|
751 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
752
|
|
753 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
|
754
|
|
755 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
756 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
757
|
|
758 } else if (matches.in.3) {
|
|
759
|
|
760 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
761 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
762
|
|
763 second.sample = sum.3[,"Sample"]
|
|
764
|
|
765 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
766
|
|
767 merge.13 = merge(sum.1, sum.3, by="merge")
|
|
768
|
|
769 current.merge.list = rbind(current.merge.list, merge.13)
|
|
770 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
771
|
|
772 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
|
773
|
|
774 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
775 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
776
|
63
|
777 } else if(nrow(rows.1) > 1){
|
|
778 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
64
|
779 print(names(patient1)[names(patient1) %in% sum.1])
|
|
780 print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
781 print(names(patient1))
|
|
782 print(names(sum.1))
|
|
783 print(summary(sum.1))
|
|
784 print(summary(patient1))
|
|
785 print(dim(sum.1))
|
|
786 print(dim(patient1))
|
|
787 print(head(sum.1[,names(patient1)]))
|
|
788 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
63
|
789 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
60
|
790 } else {
|
|
791 patient.fuzzy = patient.fuzzy[-1,]
|
|
792 }
|
|
793
|
|
794 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
795 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
796
|
|
797 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
798 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
799 } else {
|
|
800 }
|
|
801
|
|
802 }
|
|
803 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
804
|
|
805 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
806 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
807 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
|
808
|
|
809 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
810 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
811 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
812
|
|
813 patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
814 }
|
|
815 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
816 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
817 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
818 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
20
|
819 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
60
|
820
|
68
|
821 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
822 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
823 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
824 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
|
825
|
63
|
826 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
827 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
828 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
62
|
829
|
60
|
830 if(F){
|
|
831 patientMerge = merge(patient1, patient2, by="merge")
|
|
832 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
833 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
834 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
835 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
836 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
837 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
838 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
839 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
840 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
841 }
|
28
|
842
|
30
|
843 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
844 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
845 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
846 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
847
|
7
|
848 res1 = vector()
|
|
849 res2 = vector()
|
|
850 res3 = vector()
|
20
|
851 res12 = vector()
|
|
852 res13 = vector()
|
|
853 res23 = vector()
|
7
|
854 resAll = vector()
|
|
855 read1Count = vector()
|
|
856 read2Count = vector()
|
|
857 read3Count = vector()
|
|
858
|
|
859 if(appendTriplets){
|
9
|
860 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
861 }
|
|
862 for(iter in 1:length(product[,1])){
|
|
863 threshhold = product[iter,threshholdIndex]
|
|
864 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
865 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
866 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
867 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
868
|
30
|
869 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
870 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
871 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
872
|
30
|
873 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
874 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
875 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
876
|
18
|
877 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
878 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
879 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
880 res1 = append(res1, sum(one))
|
|
881 res2 = append(res2, sum(two))
|
|
882 res3 = append(res3, sum(three))
|
|
883 resAll = append(resAll, sum(all))
|
20
|
884 res12 = append(res12, sum(one_two))
|
|
885 res13 = append(res13, sum(one_three))
|
|
886 res23 = append(res23, sum(two_three))
|
7
|
887 #threshhold = 0
|
|
888 if(threshhold != 0){
|
|
889 if(sum(one) > 0){
|
20
|
890 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
891 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
892 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
893 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
894 }
|
|
895 if(sum(two) > 0){
|
20
|
896 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
897 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
898 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
899 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
900 }
|
|
901 if(sum(three) > 0){
|
20
|
902 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
903 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
904 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
905 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
906 }
|
20
|
907 if(sum(one_two) > 0){
|
|
908 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
909 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
910 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
911 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
912 }
|
|
913 if(sum(one_three) > 0){
|
|
914 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
915 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
916 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
917 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
918 }
|
|
919 if(sum(two_three) > 0){
|
|
920 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
921 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
922 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
923 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
924 }
|
|
925 } else { #scatterplot data
|
24
|
926 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
60
|
927 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
30
|
928 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
929 if(sum(in_two) > 0){
|
|
930 scatterplot_locus_data[in_two,]$type = "In two"
|
|
931 }
|
30
|
932 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
933 if(sum(in_three)> 0){
|
|
934 scatterplot_locus_data[in_three,]$type = "In three"
|
|
935 }
|
|
936 not_in_one = scatterplot_locus_data$type != "In one"
|
|
937 if(sum(not_in_one) > 0){
|
|
938 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
939 }
|
20
|
940 p = NULL
|
|
941 if(nrow(scatterplot_locus_data) != 0){
|
|
942 if(on == "normalized_read_count"){
|
31
|
943 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
944 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
945 } else {
|
68
|
946 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
947 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
948 }
|
|
949 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
950 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
951 } else {
|
|
952 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
953 }
|
|
954 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
955 print(p)
|
|
956 dev.off()
|
|
957 }
|
7
|
958 if(sum(all) > 0){
|
20
|
959 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
960 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
961 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
962 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
963 }
|
|
964 }
|
20
|
965 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
966 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
967 colnames(patientResult)[6] = oneSample
|
20
|
968 colnames(patientResult)[7] = twoSample
|
|
969 colnames(patientResult)[8] = threeSample
|
|
970 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
971 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
972 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
973
|
|
974 colnamesBak = colnames(patientResult)
|
20
|
975 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
976 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
977 colnames(patientResult) = colnamesBak
|
|
978
|
|
979 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
980 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
981
|
|
982 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
983 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
984 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
985 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
986 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
987 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
988 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
989 print(plt)
|
|
990 dev.off()
|
|
991
|
|
992 fontSize = 4
|
|
993
|
|
994 bak = patientResult
|
|
995 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
996 patientResult$relativeValue = patientResult$value * 10
|
|
997 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
998 plt = ggplot(patientResult)
|
|
999 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
1000 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
1001 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
1002 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
1003 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
1004 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
1005 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
1006 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
1007 print(plt)
|
|
1008 dev.off()
|
|
1009 }
|
|
1010
|
67
|
1011 if(F & nrow(triplets) != 0){
|
60
|
1012
|
|
1013 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
1014
|
33
|
1015 triplets$uniqueID = "ID"
|
|
1016
|
|
1017 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1018 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1019 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1020
|
|
1021 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1022 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1023 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1024
|
|
1025 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
60
|
1026
|
|
1027 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
1028
|
33
|
1029 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1030 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1031 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
1032
|
|
1033 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1034 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
1035
|
|
1036 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1037
|
|
1038 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1039
|
|
1040 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
1041
|
|
1042 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1043
|
60
|
1044 column_drops = c("min_cell_count", "min_cell_paste")
|
33
|
1045
|
|
1046 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
60
|
1047
|
|
1048 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
1049
|
33
|
1050 interval = intervalReads
|
|
1051 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1052 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1053
|
|
1054 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
1055 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
1056 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
1057 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1058
|
|
1059 one = triplets[triplets$Sample == "16278_Left",]
|
|
1060 two = triplets[triplets$Sample == "26402_Left",]
|
|
1061 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1062 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1063
|
|
1064 one = triplets[triplets$Sample == "16278_Right",]
|
|
1065 two = triplets[triplets$Sample == "26402_Right",]
|
|
1066 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1067 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1068
|
60
|
1069 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1070
|
33
|
1071 interval = intervalFreq
|
|
1072 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1073 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1074
|
|
1075 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
1076 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
1077 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
1078 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1079
|
|
1080 one = triplets[triplets$Sample == "16278_Left",]
|
|
1081 two = triplets[triplets$Sample == "26402_Left",]
|
|
1082 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1083 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1084
|
|
1085 one = triplets[triplets$Sample == "16278_Right",]
|
|
1086 two = triplets[triplets$Sample == "26402_Right",]
|
|
1087 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1088 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1089 } else {
|
|
1090 cat("", file="triplets.txt")
|
|
1091 }
|
68
|
1092 cat("</table></html>", file=logfile, append=T)
|