0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
68
|
2 options(scipen=999)
|
0
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
3
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
29
|
9 mergeOn = args[6]
|
3
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
12
|
4
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
0
|
18 #require(xtable)
|
3
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
66
|
21 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
34
|
22 dat$dsPerM = 0
|
9
|
23 dat = dat[!is.na(dat$Patient),]
|
13
|
24 dat$Related_to_leukemia_clone = F
|
9
|
25
|
0
|
26 setwd(outDir)
|
3
|
27 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
28 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
30
|
3
|
31 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
32
|
13
|
33 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
34
|
3
|
35 dat = dat[dat$Frequency >= min_freq,]
|
|
36
|
13
|
37 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
38
|
|
39 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
40
|
|
41 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
42 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
43 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
44
|
|
45 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
46 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
47
|
|
48 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
49 print(paste("rows:", nrow(dat)))
|
13
|
50 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
51 print(paste("rows:", nrow(dat)))
|
13
|
52 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
53
|
3
|
54 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
55
|
|
56 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
57
|
22
|
58 patients = split(dat, dat$Patient, drop=T)
|
9
|
59 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
60 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
61 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
62 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
63 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
64 Titles = factor(Titles, levels=Titles)
|
|
65 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
66
|
29
|
67 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
68
|
51
|
69 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
70 patient.merge.list.second = list()
|
68
|
71 scatter_locus_data_list = list()
|
56
|
72 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
73 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
0
|
74 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
75 if (!is.data.frame(x) & is.list(x)){
|
|
76 x = x[[1]]
|
|
77 }
|
34
|
78 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
79 x = data.frame(x,stringsAsFactors=F)
|
0
|
80 onShort = "reads"
|
|
81 if(on == "Frequency"){
|
|
82 onShort = "freq"
|
|
83 }
|
18
|
84 onx = paste(on, ".x", sep="")
|
|
85 ony = paste(on, ".y", sep="")
|
0
|
86 splt = split(x, x$Sample, drop=T)
|
4
|
87 type="pair"
|
2
|
88 if(length(splt) == 1){
|
3
|
89 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
90 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
91 type="single"
|
2
|
92 }
|
0
|
93 patient1 = splt[[1]]
|
|
94 patient2 = splt[[2]]
|
|
95
|
|
96 threshholdIndex = which(colnames(product) == "interval")
|
|
97 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
98 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
99 titleIndex = which(colnames(product) == "Titles")
|
|
100 sampleIndex = which(colnames(x) == "Sample")
|
|
101 patientIndex = which(colnames(x) == "Patient")
|
|
102 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
103 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
104 patient = paste(x[1,patientIndex])
|
3
|
105
|
0
|
106 switched = F
|
|
107 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
108 tmp = twoSample
|
|
109 twoSample = oneSample
|
|
110 oneSample = tmp
|
|
111 tmp = patient1
|
|
112 patient1 = patient2
|
|
113 patient2 = tmp
|
|
114 switched = T
|
|
115 }
|
|
116 if(appendtxt){
|
4
|
117 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
118 }
|
51
|
119 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
9
|
120
|
29
|
121 if(mergeOn == "Clone_Sequence"){
|
|
122 patient1$merge = paste(patient1$Clone_Sequence)
|
|
123 patient2$merge = paste(patient2$Clone_Sequence)
|
|
124 } else {
|
|
125 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
126 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
127 }
|
9
|
128
|
30
|
129 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
68
|
130 #scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
|
131 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
132 #scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
133 #scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
|
134 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
135 #scatterplot_data$type = factor(x=NULL, levels=scatterplot.data.type.factor)
|
|
136 scatterplot_data$type = character(0)
|
|
137 scatterplot_data$link = numeric(0)
|
|
138 scatterplot_data$on = character(0)
|
29
|
139
|
49
|
140 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
141 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
142
|
|
143 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
144
|
51
|
145 start.time = proc.time()
|
|
146 merge.list = c()
|
|
147
|
|
148 if(patient %in% names(patient.merge.list)){
|
|
149 patientMerge = patient.merge.list[[patient]]
|
|
150 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
68
|
151 scatterplot_data = scatter_locus_data_list[[patient]]
|
51
|
152 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
153
|
|
154 print(names(patient.merge.list))
|
|
155 } else {
|
|
156 #fuzzy matching here...
|
49
|
157 #merge.list = patientMerge$merge
|
51
|
158
|
49
|
159 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
160 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
161
|
|
162 patient1.fuzzy = patient1
|
|
163 patient2.fuzzy = patient2
|
|
164
|
36
|
165 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
166 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
167
|
48
|
168 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
169 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
170
|
48
|
171 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
172 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
51
|
173
|
49
|
174 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
175 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
51
|
176
|
49
|
177 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
178 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
51
|
179
|
37
|
180 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
181 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
49
|
182
|
|
183 merge.list = list()
|
|
184
|
|
185 merge.list[["second"]] = vector()
|
68
|
186
|
|
187 link.count = 1
|
|
188
|
37
|
189 while(nrow(patient.fuzzy) > 1){
|
|
190 first.merge = patient.fuzzy[1,"merge"]
|
|
191 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
49
|
192 first.sample = patient.fuzzy[1,"Sample"]
|
37
|
193 merge.filter = first.merge == patient.fuzzy$merge
|
51
|
194
|
49
|
195 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
51
|
196
|
49
|
197 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
198 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
199
|
|
200 #first match same sample, sum to a single row, same for other sample
|
|
201 #then merge rows like 'normal'
|
51
|
202
|
48
|
203 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
49
|
204
|
|
205
|
|
206
|
44
|
207 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
49
|
208 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
209 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
210
|
|
211 first.rows = patient.fuzzy[first.match.filter,]
|
|
212 second.rows = patient.fuzzy[second.match.filter,]
|
|
213
|
50
|
214 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
215 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
216 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
217 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
218
|
49
|
219 first.sum = data.frame(merge = first.clone.sequence,
|
|
220 Patient = patient,
|
|
221 Receptor = first.rows[1,"Receptor"],
|
|
222 Sample = first.rows[1,"Sample"],
|
|
223 Cell_Count = first.rows[1,"Cell_Count"],
|
|
224 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
225 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
226 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
227 dsPerM = sum(first.rows$dsPerM),
|
50
|
228 J_Segment_Major_Gene = first.rows.j,
|
|
229 V_Segment_Major_Gene = first.rows.v,
|
49
|
230 Clone_Sequence = first.clone.sequence,
|
|
231 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
232 Related_to_leukemia_clone = F,
|
|
233 Frequency = sum(first.rows$Frequency),
|
|
234 locus_V = first.rows[1,"locus_V"],
|
|
235 locus_J = first.rows[1,"locus_J"],
|
|
236 min_cell_count = first.rows[1,"min_cell_count"],
|
|
237 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
238 paste = first.rows[1,"paste"],
|
|
239 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
240
|
|
241 if(nrow(second.rows) > 0){
|
50
|
242 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
243 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
244 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
245 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
246
|
55
|
247 second.sum = data.frame(merge = first.clone.sequence,
|
49
|
248 Patient = patient,
|
|
249 Receptor = second.rows[1,"Receptor"],
|
|
250 Sample = second.rows[1,"Sample"],
|
|
251 Cell_Count = second.rows[1,"Cell_Count"],
|
|
252 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
253 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
254 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
255 dsPerM = sum(second.rows$dsPerM),
|
50
|
256 J_Segment_Major_Gene = second.rows.j,
|
|
257 V_Segment_Major_Gene = second.rows.v,
|
49
|
258 Clone_Sequence = first.clone.sequence,
|
|
259 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
260 Related_to_leukemia_clone = F,
|
|
261 Frequency = sum(second.rows$Frequency),
|
|
262 locus_V = second.rows[1,"locus_V"],
|
|
263 locus_J = second.rows[1,"locus_J"],
|
|
264 min_cell_count = second.rows[1,"min_cell_count"],
|
|
265 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
266 paste = second.rows[1,"paste"],
|
|
267 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
268
|
|
269 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
270 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
271
|
50
|
272 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
273 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
49
|
274
|
56
|
275 tmp.rows = rbind(first.rows, second.rows)
|
|
276 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
68
|
277
|
|
278
|
|
279 #add to the scatterplot data
|
|
280 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
281 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
282 scatterplot.row$link = link.count
|
|
283 scatterplot.row$on = onShort
|
|
284
|
|
285 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
286
|
|
287 scatterplot.row = second.sum[,scatterplot_data_columns]
|
|
288 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
289 scatterplot.row$link = link.count
|
|
290 scatterplot.row$on = onShort
|
|
291
|
|
292 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
293
|
|
294 #write some information about the match to a log file
|
56
|
295 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
296 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
297 } else {
|
|
298 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
299 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
300 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
301 } else {
|
58
|
302 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
49
|
303 }
|
|
304 }
|
|
305
|
59
|
306 } else if(nrow(first.rows) > 1) {
|
|
307 if(patient1[1,"Sample"] == first.sample){
|
|
308 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
309 patient1 = rbind(patient1, first.sum)
|
|
310 } else {
|
|
311 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
312 patient2 = rbind(patient2, first.sum)
|
|
313 }
|
|
314
|
|
315 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
316 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
317
|
|
318 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
68
|
319
|
|
320 #add to the scatterplot data
|
|
321 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
322 scatterplot.row$type = first.sum[,"Sample"]
|
|
323 scatterplot.row$link = link.count
|
|
324 scatterplot.row$on = onShort
|
|
325
|
|
326 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
59
|
327
|
|
328 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
37
|
329 } else {
|
|
330 patient.fuzzy = patient.fuzzy[-1,]
|
68
|
331
|
|
332 #add to the scatterplot data
|
|
333 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
334 scatterplot.row$type = first.sum[,"Sample"]
|
|
335 scatterplot.row$link = link.count
|
|
336 scatterplot.row$on = onShort
|
|
337
|
|
338 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
34
|
339 }
|
68
|
340 link.count = link.count + 1
|
34
|
341 }
|
51
|
342 patient.merge.list[[patient]] <<- patientMerge
|
|
343 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
69
|
344
|
|
345 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
|
|
346 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
|
|
347
|
68
|
348 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
51
|
349 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
34
|
350 }
|
51
|
351
|
52
|
352 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
353 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
51
|
354
|
37
|
355
|
18
|
356 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
68
|
357 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
0
|
358 res1 = vector()
|
|
359 res2 = vector()
|
|
360 resBoth = vector()
|
|
361 read1Count = vector()
|
|
362 read2Count = vector()
|
2
|
363 locussum1 = vector()
|
|
364 locussum2 = vector()
|
9
|
365
|
0
|
366 #for(iter in 1){
|
|
367 for(iter in 1:length(product[,1])){
|
|
368 threshhold = product[iter,threshholdIndex]
|
|
369 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
370 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
371 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
372 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
373 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
374 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
375 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
376 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
377 res1 = append(res1, sum(one))
|
2
|
378 res2 = append(res2, sum(two))
|
0
|
379 resBoth = append(resBoth, sum(both))
|
2
|
380 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
381 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
382 #threshhold = 0
|
|
383 if(threshhold != 0){
|
|
384 if(sum(one) > 0){
|
15
|
385 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
386 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
387 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
388 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
389 }
|
|
390 if(sum(two) > 0){
|
15
|
391 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
392 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
393 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
394 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
395 }
|
29
|
396 } else {
|
|
397 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
398 if(nrow(scatterplot_locus_data) > 0){
|
|
399 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
400 }
|
68
|
401
|
|
402
|
69
|
403
|
29
|
404 p = NULL
|
68
|
405 print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
29
|
406 if(nrow(scatterplot_locus_data) != 0){
|
|
407 if(on == "normalized_read_count"){
|
30
|
408 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
69
|
409 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,10^6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
410 } else {
|
69
|
411 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
412 }
|
68
|
413 p = p + geom_point(aes(colour=type), position="dodge")
|
29
|
414 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
415 } else {
|
|
416 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
417 }
|
|
418 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
419 print(p)
|
|
420 dev.off()
|
0
|
421 }
|
|
422 if(sum(both) > 0){
|
15
|
423 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
424 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
425 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
426 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
427 }
|
0
|
428 }
|
2
|
429 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
430 if(sum(is.na(patientResult$percentage)) > 0){
|
|
431 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
432 }
|
|
433 colnames(patientResult)[6] = oneSample
|
|
434 colnames(patientResult)[8] = twoSample
|
|
435 colnamesBak = colnames(patientResult)
|
2
|
436 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
437 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
438 colnames(patientResult) = colnamesBak
|
|
439
|
|
440 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
441 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
442
|
|
443 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
444 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
445 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
446 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
447 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
448 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
449 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
450 print(plt)
|
|
451 dev.off()
|
|
452 #(t,r,b,l)
|
|
453 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
454 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
455 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
456 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
457 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
458 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
459 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
460 print(plt)
|
|
461 dev.off()
|
|
462
|
|
463 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
464 patientResult$relativeValue = patientResult$value * 10
|
|
465 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
466 plt = ggplot(patientResult)
|
|
467 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
468 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
469 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
470 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
471 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
472 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
473 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
474 print(plt)
|
|
475 dev.off()
|
|
476 }
|
|
477
|
3
|
478 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
479
|
0
|
480 interval = intervalFreq
|
|
481 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
482 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
483 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
484
|
3
|
485 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
486
|
0
|
487 interval = intervalReads
|
|
488 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
489 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
490 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
33
|
491
|
|
492 if(nrow(single_patients) > 0){
|
|
493 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
68
|
494 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
33
|
495 p = p + geom_point(aes(colour=type), position="jitter")
|
|
496 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
497 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
498 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
499 print(p)
|
|
500 dev.off()
|
7
|
501
|
68
|
502 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
503 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
33
|
504 p = p + geom_point(aes(colour=type), position="jitter")
|
|
505 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
506 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
507 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
508 print(p)
|
|
509 dev.off()
|
|
510 } else {
|
|
511 empty <- data.frame()
|
|
512 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
513
|
|
514 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
515 print(p)
|
|
516 dev.off()
|
|
517
|
|
518 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
519 print(p)
|
|
520 dev.off()
|
|
521 }
|
60
|
522
|
|
523 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
524 patient.merge.list.second = list()
|
|
525
|
7
|
526 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
527 onShort = "reads"
|
|
528 if(on == "Frequency"){
|
|
529 onShort = "freq"
|
|
530 }
|
18
|
531 onx = paste(on, ".x", sep="")
|
|
532 ony = paste(on, ".y", sep="")
|
|
533 onz = paste(on, ".z", sep="")
|
7
|
534 type="triplet"
|
|
535
|
|
536 threshholdIndex = which(colnames(product) == "interval")
|
|
537 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
538 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
539 titleIndex = which(colnames(product) == "Titles")
|
|
540 sampleIndex = which(colnames(patient1) == "Sample")
|
|
541 patientIndex = which(colnames(patient1) == "Patient")
|
|
542 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
543 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
544 threeSample = paste(patient3[1,sampleIndex], sep="")
|
60
|
545
|
29
|
546 if(mergeOn == "Clone_Sequence"){
|
|
547 patient1$merge = paste(patient1$Clone_Sequence)
|
|
548 patient2$merge = paste(patient2$Clone_Sequence)
|
|
549 patient3$merge = paste(patient3$Clone_Sequence)
|
|
550
|
|
551 } else {
|
|
552 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
553 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
554 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
555 }
|
60
|
556
|
|
557 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
558 patient1.fuzzy = patient1
|
|
559 patient2.fuzzy = patient2
|
|
560 patient3.fuzzy = patient3
|
|
561
|
|
562 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
|
563
|
|
564 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
565 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
566 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
|
567
|
|
568 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
65
|
569 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
60
|
570
|
|
571 other.sample.list = list()
|
|
572 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
573 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
574 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
|
575
|
9
|
576 patientMerge = merge(patient1, patient2, by="merge")
|
|
577 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
578 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
60
|
579 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
580 patientMerge = patientMerge[NULL,]
|
|
581
|
|
582 duo.merge.list = list()
|
|
583
|
20
|
584 patientMerge12 = merge(patient1, patient2, by="merge")
|
60
|
585 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
586 patientMerge12 = patientMerge12[NULL,]
|
|
587 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
588 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
|
589
|
20
|
590 patientMerge13 = merge(patient1, patient3, by="merge")
|
60
|
591 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
592 patientMerge13 = patientMerge13[NULL,]
|
|
593 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
594 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
|
595
|
20
|
596 patientMerge23 = merge(patient2, patient3, by="merge")
|
60
|
597 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
598 patientMerge23 = patientMerge23[NULL,]
|
|
599 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
600 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
|
601
|
|
602 merge.list = list()
|
|
603 merge.list[["second"]] = vector()
|
|
604
|
|
605 start.time = proc.time()
|
|
606 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
607 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
608 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
609 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
610 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
|
611
|
|
612 merge.list[["second"]] = patient.merge.list.second[[label1]]
|
|
613
|
|
614 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
615 } else {
|
|
616 while(nrow(patient.fuzzy) > 0){
|
|
617 first.merge = patient.fuzzy[1,"merge"]
|
|
618 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
619 first.sample = patient.fuzzy[1,"Sample"]
|
|
620
|
|
621 merge.filter = first.merge == patient.fuzzy$merge
|
|
622
|
|
623 second.sample = other.sample.list[[first.sample]][1]
|
|
624 third.sample = other.sample.list[[first.sample]][2]
|
|
625
|
|
626 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
627 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
628 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
|
629
|
|
630 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
|
631
|
|
632 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
633 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
634 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
|
635
|
|
636 matches.in.1 = any(match.filter.1)
|
|
637 matches.in.2 = any(match.filter.2)
|
|
638 matches.in.3 = any(match.filter.3)
|
|
639
|
|
640
|
|
641
|
|
642 rows.1 = patient.fuzzy[match.filter.1,]
|
|
643
|
|
644 sum.1 = data.frame(merge = first.clone.sequence,
|
|
645 Patient = label1,
|
|
646 Receptor = rows.1[1,"Receptor"],
|
|
647 Sample = rows.1[1,"Sample"],
|
|
648 Cell_Count = rows.1[1,"Cell_Count"],
|
|
649 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
650 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
651 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
652 dsPerM = sum(rows.1$dsPerM),
|
|
653 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
654 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
655 Clone_Sequence = first.clone.sequence,
|
|
656 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
657 Related_to_leukemia_clone = F,
|
|
658 Frequency = sum(rows.1$Frequency),
|
|
659 locus_V = rows.1[1,"locus_V"],
|
|
660 locus_J = rows.1[1,"locus_J"],
|
64
|
661 uniqueID = rows.1[1,"uniqueID"],
|
60
|
662 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
663 sum.2 = sum.1[NULL,]
|
|
664 rows.2 = patient.fuzzy[match.filter.2,]
|
|
665 if(matches.in.2){
|
|
666 sum.2 = data.frame(merge = first.clone.sequence,
|
|
667 Patient = label1,
|
|
668 Receptor = rows.2[1,"Receptor"],
|
|
669 Sample = rows.2[1,"Sample"],
|
|
670 Cell_Count = rows.2[1,"Cell_Count"],
|
|
671 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
672 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
673 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
674 dsPerM = sum(rows.2$dsPerM),
|
|
675 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
676 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
677 Clone_Sequence = first.clone.sequence,
|
|
678 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
679 Related_to_leukemia_clone = F,
|
|
680 Frequency = sum(rows.2$Frequency),
|
|
681 locus_V = rows.2[1,"locus_V"],
|
|
682 locus_J = rows.2[1,"locus_J"],
|
64
|
683 uniqueID = rows.2[1,"uniqueID"],
|
60
|
684 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
685 }
|
|
686
|
|
687 sum.3 = sum.1[NULL,]
|
|
688 rows.3 = patient.fuzzy[match.filter.3,]
|
|
689 if(matches.in.3){
|
|
690 sum.3 = data.frame(merge = first.clone.sequence,
|
|
691 Patient = label1,
|
|
692 Receptor = rows.3[1,"Receptor"],
|
|
693 Sample = rows.3[1,"Sample"],
|
|
694 Cell_Count = rows.3[1,"Cell_Count"],
|
|
695 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
696 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
697 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
698 dsPerM = sum(rows.3$dsPerM),
|
|
699 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
700 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
701 Clone_Sequence = first.clone.sequence,
|
|
702 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
703 Related_to_leukemia_clone = F,
|
|
704 Frequency = sum(rows.3$Frequency),
|
|
705 locus_V = rows.3[1,"locus_V"],
|
|
706 locus_J = rows.3[1,"locus_J"],
|
64
|
707 uniqueID = rows.3[1,"uniqueID"],
|
60
|
708 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
709 }
|
|
710
|
|
711 if(matches.in.2 & matches.in.3){
|
|
712 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
713 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
714 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
715 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
|
716
|
|
717 patientMerge = rbind(patientMerge, merge.123)
|
|
718 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
|
719
|
|
720 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
721 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
722
|
|
723 } else if (matches.in.2) {
|
|
724 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
725 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
726
|
|
727 second.sample = sum.2[,"Sample"]
|
|
728
|
|
729 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
730
|
|
731 merge.12 = merge(sum.1, sum.2, by="merge")
|
|
732
|
|
733 current.merge.list = rbind(current.merge.list, merge.12)
|
|
734 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
735
|
|
736 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
|
737
|
|
738 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
739 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
740
|
|
741 } else if (matches.in.3) {
|
|
742
|
|
743 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
744 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
745
|
|
746 second.sample = sum.3[,"Sample"]
|
|
747
|
|
748 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
749
|
|
750 merge.13 = merge(sum.1, sum.3, by="merge")
|
|
751
|
|
752 current.merge.list = rbind(current.merge.list, merge.13)
|
|
753 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
754
|
|
755 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
|
756
|
|
757 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
758 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
759
|
63
|
760 } else if(nrow(rows.1) > 1){
|
|
761 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
64
|
762 print(names(patient1)[names(patient1) %in% sum.1])
|
|
763 print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
764 print(names(patient1))
|
|
765 print(names(sum.1))
|
|
766 print(summary(sum.1))
|
|
767 print(summary(patient1))
|
|
768 print(dim(sum.1))
|
|
769 print(dim(patient1))
|
|
770 print(head(sum.1[,names(patient1)]))
|
|
771 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
63
|
772 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
60
|
773 } else {
|
|
774 patient.fuzzy = patient.fuzzy[-1,]
|
|
775 }
|
|
776
|
|
777 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
778 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
779
|
|
780 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
781 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
782 } else {
|
|
783 }
|
|
784
|
|
785 }
|
|
786 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
787
|
|
788 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
789 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
790 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
|
791
|
|
792 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
793 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
794 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
795
|
|
796 patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
797 }
|
|
798 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
799 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
800 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
801 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
20
|
802 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
60
|
803
|
68
|
804 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
805 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
806 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
807 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
|
808
|
63
|
809 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
810 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
811 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
62
|
812
|
60
|
813 if(F){
|
|
814 patientMerge = merge(patient1, patient2, by="merge")
|
|
815 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
816 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
817 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
818 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
819 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
820 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
821 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
822 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
823 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
824 }
|
28
|
825
|
30
|
826 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
827 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
828 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
829 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
830
|
7
|
831 res1 = vector()
|
|
832 res2 = vector()
|
|
833 res3 = vector()
|
20
|
834 res12 = vector()
|
|
835 res13 = vector()
|
|
836 res23 = vector()
|
7
|
837 resAll = vector()
|
|
838 read1Count = vector()
|
|
839 read2Count = vector()
|
|
840 read3Count = vector()
|
|
841
|
|
842 if(appendTriplets){
|
9
|
843 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
844 }
|
|
845 for(iter in 1:length(product[,1])){
|
|
846 threshhold = product[iter,threshholdIndex]
|
|
847 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
848 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
849 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
850 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
851
|
30
|
852 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
853 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
854 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
855
|
30
|
856 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
857 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
858 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
859
|
18
|
860 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
861 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
862 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
863 res1 = append(res1, sum(one))
|
|
864 res2 = append(res2, sum(two))
|
|
865 res3 = append(res3, sum(three))
|
|
866 resAll = append(resAll, sum(all))
|
20
|
867 res12 = append(res12, sum(one_two))
|
|
868 res13 = append(res13, sum(one_three))
|
|
869 res23 = append(res23, sum(two_three))
|
7
|
870 #threshhold = 0
|
|
871 if(threshhold != 0){
|
|
872 if(sum(one) > 0){
|
20
|
873 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
874 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
875 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
876 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
877 }
|
|
878 if(sum(two) > 0){
|
20
|
879 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
880 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
881 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
882 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
883 }
|
|
884 if(sum(three) > 0){
|
20
|
885 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
886 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
887 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
888 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
889 }
|
20
|
890 if(sum(one_two) > 0){
|
|
891 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
892 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
893 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
894 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
895 }
|
|
896 if(sum(one_three) > 0){
|
|
897 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
898 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
899 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
900 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
901 }
|
|
902 if(sum(two_three) > 0){
|
|
903 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
904 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
905 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
906 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
907 }
|
|
908 } else { #scatterplot data
|
24
|
909 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
60
|
910 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
30
|
911 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
912 if(sum(in_two) > 0){
|
|
913 scatterplot_locus_data[in_two,]$type = "In two"
|
|
914 }
|
30
|
915 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
916 if(sum(in_three)> 0){
|
|
917 scatterplot_locus_data[in_three,]$type = "In three"
|
|
918 }
|
|
919 not_in_one = scatterplot_locus_data$type != "In one"
|
|
920 if(sum(not_in_one) > 0){
|
|
921 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
922 }
|
20
|
923 p = NULL
|
|
924 if(nrow(scatterplot_locus_data) != 0){
|
|
925 if(on == "normalized_read_count"){
|
31
|
926 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
927 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
928 } else {
|
68
|
929 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
930 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
931 }
|
|
932 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
933 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
934 } else {
|
|
935 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
936 }
|
|
937 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
938 print(p)
|
|
939 dev.off()
|
|
940 }
|
7
|
941 if(sum(all) > 0){
|
20
|
942 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
943 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
944 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
945 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
946 }
|
|
947 }
|
20
|
948 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
949 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
950 colnames(patientResult)[6] = oneSample
|
20
|
951 colnames(patientResult)[7] = twoSample
|
|
952 colnames(patientResult)[8] = threeSample
|
|
953 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
954 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
955 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
956
|
|
957 colnamesBak = colnames(patientResult)
|
20
|
958 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
959 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
960 colnames(patientResult) = colnamesBak
|
|
961
|
|
962 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
963 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
964
|
|
965 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
966 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
967 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
968 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
969 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
970 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
971 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
972 print(plt)
|
|
973 dev.off()
|
|
974
|
|
975 fontSize = 4
|
|
976
|
|
977 bak = patientResult
|
|
978 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
979 patientResult$relativeValue = patientResult$value * 10
|
|
980 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
981 plt = ggplot(patientResult)
|
|
982 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
983 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
984 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
985 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
986 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
987 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
988 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
989 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
990 print(plt)
|
|
991 dev.off()
|
|
992 }
|
|
993
|
67
|
994 if(F & nrow(triplets) != 0){
|
60
|
995
|
|
996 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
997
|
33
|
998 triplets$uniqueID = "ID"
|
|
999
|
|
1000 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1001 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1002 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1003
|
|
1004 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1005 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1006 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1007
|
|
1008 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
60
|
1009
|
|
1010 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
1011
|
33
|
1012 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1013 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1014 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
1015
|
|
1016 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1017 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
1018
|
|
1019 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1020
|
|
1021 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1022
|
|
1023 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
1024
|
|
1025 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1026
|
60
|
1027 column_drops = c("min_cell_count", "min_cell_paste")
|
33
|
1028
|
|
1029 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
60
|
1030
|
|
1031 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
1032
|
33
|
1033 interval = intervalReads
|
|
1034 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1035 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1036
|
|
1037 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
1038 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
1039 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
1040 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1041
|
|
1042 one = triplets[triplets$Sample == "16278_Left",]
|
|
1043 two = triplets[triplets$Sample == "26402_Left",]
|
|
1044 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1045 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1046
|
|
1047 one = triplets[triplets$Sample == "16278_Right",]
|
|
1048 two = triplets[triplets$Sample == "26402_Right",]
|
|
1049 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1050 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1051
|
60
|
1052 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1053
|
33
|
1054 interval = intervalFreq
|
|
1055 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1056 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1057
|
|
1058 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
1059 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
1060 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
1061 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1062
|
|
1063 one = triplets[triplets$Sample == "16278_Left",]
|
|
1064 two = triplets[triplets$Sample == "26402_Left",]
|
|
1065 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1066 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1067
|
|
1068 one = triplets[triplets$Sample == "16278_Right",]
|
|
1069 two = triplets[triplets$Sample == "26402_Right",]
|
|
1070 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1071 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1072 } else {
|
|
1073 cat("", file="triplets.txt")
|
|
1074 }
|
68
|
1075 cat("</table></html>", file=logfile, append=T)
|