0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
22
|
57 patients = split(dat, dat$Patient, drop=T)
|
9
|
58 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
59 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
60 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
61 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
62 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
63 Titles = factor(Titles, levels=Titles)
|
|
64 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
65
|
29
|
66 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
67
|
51
|
68 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
69 patient.merge.list.second = list()
|
56
|
70 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
71 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
0
|
72 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
73 if (!is.data.frame(x) & is.list(x)){
|
|
74 x = x[[1]]
|
|
75 }
|
34
|
76 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
77 x = data.frame(x,stringsAsFactors=F)
|
0
|
78 onShort = "reads"
|
|
79 if(on == "Frequency"){
|
|
80 onShort = "freq"
|
|
81 }
|
18
|
82 onx = paste(on, ".x", sep="")
|
|
83 ony = paste(on, ".y", sep="")
|
0
|
84 splt = split(x, x$Sample, drop=T)
|
4
|
85 type="pair"
|
2
|
86 if(length(splt) == 1){
|
3
|
87 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
88 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
89 type="single"
|
2
|
90 }
|
0
|
91 patient1 = splt[[1]]
|
|
92 patient2 = splt[[2]]
|
|
93
|
|
94 threshholdIndex = which(colnames(product) == "interval")
|
|
95 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
96 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
97 titleIndex = which(colnames(product) == "Titles")
|
|
98 sampleIndex = which(colnames(x) == "Sample")
|
|
99 patientIndex = which(colnames(x) == "Patient")
|
|
100 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
101 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
102 patient = paste(x[1,patientIndex])
|
3
|
103
|
0
|
104 switched = F
|
|
105 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
106 tmp = twoSample
|
|
107 twoSample = oneSample
|
|
108 oneSample = tmp
|
|
109 tmp = patient1
|
|
110 patient1 = patient2
|
|
111 patient2 = tmp
|
|
112 switched = T
|
|
113 }
|
|
114 if(appendtxt){
|
4
|
115 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
116 }
|
51
|
117 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
9
|
118
|
29
|
119 if(mergeOn == "Clone_Sequence"){
|
|
120 patient1$merge = paste(patient1$Clone_Sequence)
|
|
121 patient2$merge = paste(patient2$Clone_Sequence)
|
|
122 } else {
|
|
123 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
124 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
125 }
|
9
|
126
|
30
|
127 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
128 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
129 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
130 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
131 scatterplot_data$on = onShort
|
|
132
|
49
|
133 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
134 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
135
|
|
136 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
137
|
51
|
138 start.time = proc.time()
|
|
139 merge.list = c()
|
|
140
|
|
141 if(patient %in% names(patient.merge.list)){
|
|
142 patientMerge = patient.merge.list[[patient]]
|
|
143 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
144 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
145
|
|
146 print(names(patient.merge.list))
|
|
147 } else {
|
|
148 #fuzzy matching here...
|
49
|
149 #merge.list = patientMerge$merge
|
51
|
150
|
49
|
151 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
152 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
153
|
|
154 patient1.fuzzy = patient1
|
|
155 patient2.fuzzy = patient2
|
|
156
|
36
|
157 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
158 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
159
|
48
|
160 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
161 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
162
|
48
|
163 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
164 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
51
|
165
|
49
|
166 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
167 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
51
|
168
|
49
|
169 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
170 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
51
|
171
|
37
|
172 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
173 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
49
|
174
|
|
175 merge.list = list()
|
|
176
|
|
177 merge.list[["second"]] = vector()
|
|
178
|
37
|
179 while(nrow(patient.fuzzy) > 1){
|
|
180 first.merge = patient.fuzzy[1,"merge"]
|
|
181 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
49
|
182 first.sample = patient.fuzzy[1,"Sample"]
|
37
|
183 merge.filter = first.merge == patient.fuzzy$merge
|
51
|
184
|
49
|
185 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
51
|
186
|
49
|
187 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
188 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
189
|
|
190 #first match same sample, sum to a single row, same for other sample
|
|
191 #then merge rows like 'normal'
|
51
|
192
|
48
|
193 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
49
|
194
|
|
195
|
|
196
|
44
|
197 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
49
|
198 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
199 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
200
|
|
201 first.rows = patient.fuzzy[first.match.filter,]
|
|
202 second.rows = patient.fuzzy[second.match.filter,]
|
|
203
|
50
|
204 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
205 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
206 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
207 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
208
|
49
|
209 first.sum = data.frame(merge = first.clone.sequence,
|
|
210 Patient = patient,
|
|
211 Receptor = first.rows[1,"Receptor"],
|
|
212 Sample = first.rows[1,"Sample"],
|
|
213 Cell_Count = first.rows[1,"Cell_Count"],
|
|
214 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
215 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
216 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
217 dsPerM = sum(first.rows$dsPerM),
|
50
|
218 J_Segment_Major_Gene = first.rows.j,
|
|
219 V_Segment_Major_Gene = first.rows.v,
|
49
|
220 Clone_Sequence = first.clone.sequence,
|
|
221 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
222 Related_to_leukemia_clone = F,
|
|
223 Frequency = sum(first.rows$Frequency),
|
|
224 locus_V = first.rows[1,"locus_V"],
|
|
225 locus_J = first.rows[1,"locus_J"],
|
|
226 min_cell_count = first.rows[1,"min_cell_count"],
|
|
227 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
228 paste = first.rows[1,"paste"],
|
|
229 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
230
|
|
231 if(nrow(second.rows) > 0){
|
50
|
232 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
233 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
234 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
235 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
236
|
55
|
237 second.sum = data.frame(merge = first.clone.sequence,
|
49
|
238 Patient = patient,
|
|
239 Receptor = second.rows[1,"Receptor"],
|
|
240 Sample = second.rows[1,"Sample"],
|
|
241 Cell_Count = second.rows[1,"Cell_Count"],
|
|
242 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
243 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
244 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
245 dsPerM = sum(second.rows$dsPerM),
|
50
|
246 J_Segment_Major_Gene = second.rows.j,
|
|
247 V_Segment_Major_Gene = second.rows.v,
|
49
|
248 Clone_Sequence = first.clone.sequence,
|
|
249 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
250 Related_to_leukemia_clone = F,
|
|
251 Frequency = sum(second.rows$Frequency),
|
|
252 locus_V = second.rows[1,"locus_V"],
|
|
253 locus_J = second.rows[1,"locus_J"],
|
|
254 min_cell_count = second.rows[1,"min_cell_count"],
|
|
255 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
256 paste = second.rows[1,"paste"],
|
|
257 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
258
|
|
259 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
260 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
261
|
50
|
262 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
263 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
49
|
264
|
56
|
265 tmp.rows = rbind(first.rows, second.rows)
|
|
266 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
267
|
|
268 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
269 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
270 } else {
|
|
271 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
272 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
273 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
274 } else {
|
58
|
275 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
49
|
276 }
|
|
277 }
|
|
278
|
59
|
279 } else if(nrow(first.rows) > 1) {
|
|
280 if(patient1[1,"Sample"] == first.sample){
|
|
281 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
282 patient1 = rbind(patient1, first.sum)
|
|
283 } else {
|
|
284 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
285 patient2 = rbind(patient2, first.sum)
|
|
286 }
|
|
287
|
|
288 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
289 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
290
|
|
291 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
|
292
|
|
293 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
37
|
294 } else {
|
|
295 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
296 }
|
|
297 }
|
51
|
298 patient.merge.list[[patient]] <<- patientMerge
|
|
299 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
300 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
34
|
301 }
|
51
|
302
|
52
|
303 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
304 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
51
|
305
|
37
|
306
|
18
|
307 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
308 res1 = vector()
|
|
309 res2 = vector()
|
|
310 resBoth = vector()
|
|
311 read1Count = vector()
|
|
312 read2Count = vector()
|
2
|
313 locussum1 = vector()
|
|
314 locussum2 = vector()
|
9
|
315
|
0
|
316 #for(iter in 1){
|
|
317 for(iter in 1:length(product[,1])){
|
|
318 threshhold = product[iter,threshholdIndex]
|
|
319 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
320 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
321 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
322 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
323 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
324 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
325 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
326 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
327 res1 = append(res1, sum(one))
|
2
|
328 res2 = append(res2, sum(two))
|
0
|
329 resBoth = append(resBoth, sum(both))
|
2
|
330 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
331 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
332 #threshhold = 0
|
|
333 if(threshhold != 0){
|
|
334 if(sum(one) > 0){
|
15
|
335 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
336 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
337 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
338 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
339 }
|
|
340 if(sum(two) > 0){
|
15
|
341 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
342 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
343 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
344 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
345 }
|
29
|
346 } else {
|
|
347 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
49
|
348 #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),]
|
|
349 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
29
|
350 if(nrow(scatterplot_locus_data) > 0){
|
|
351 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
352 }
|
34
|
353 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
354 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
355 if(any(in_two)){
|
49
|
356 scatterplot_locus_data[in_two,]$type = twoSample
|
34
|
357 }
|
49
|
358 in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
|
|
359 #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
|
|
360 #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
|
30
|
361 if(any(in_both)){
|
|
362 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
363 }
|
29
|
364 if(type == "single"){
|
|
365 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
366 }
|
|
367 p = NULL
|
|
368 if(nrow(scatterplot_locus_data) != 0){
|
|
369 if(on == "normalized_read_count"){
|
30
|
370 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
49
|
371 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
372 } else {
|
49
|
373 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
374 }
|
|
375 p = p + geom_point(aes(colour=type), position="jitter")
|
|
376 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
377 } else {
|
|
378 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
379 }
|
|
380 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
381 print(p)
|
|
382 dev.off()
|
0
|
383 }
|
|
384 if(sum(both) > 0){
|
15
|
385 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
386 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
387 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
388 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
389 }
|
0
|
390 }
|
2
|
391 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
392 if(sum(is.na(patientResult$percentage)) > 0){
|
|
393 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
394 }
|
|
395 colnames(patientResult)[6] = oneSample
|
|
396 colnames(patientResult)[8] = twoSample
|
|
397 colnamesBak = colnames(patientResult)
|
2
|
398 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
399 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
400 colnames(patientResult) = colnamesBak
|
|
401
|
|
402 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
403 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
404
|
|
405 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
406 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
407 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
408 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
409 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
410 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
411 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
412 print(plt)
|
|
413 dev.off()
|
|
414 #(t,r,b,l)
|
|
415 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
416 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
417 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
418 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
419 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
420 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
421 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
422 print(plt)
|
|
423 dev.off()
|
|
424
|
|
425 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
426 patientResult$relativeValue = patientResult$value * 10
|
|
427 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
428 plt = ggplot(patientResult)
|
|
429 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
430 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
431 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
432 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
433 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
434 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
435 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
436 print(plt)
|
|
437 dev.off()
|
|
438 }
|
|
439
|
3
|
440 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
441
|
0
|
442 interval = intervalFreq
|
|
443 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
444 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
445 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
446
|
3
|
447 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
448
|
0
|
449 interval = intervalReads
|
|
450 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
451 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
452 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
33
|
453
|
|
454 if(nrow(single_patients) > 0){
|
|
455 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
456 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
457 p = p + geom_point(aes(colour=type), position="jitter")
|
|
458 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
459 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
460 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
461 print(p)
|
|
462 dev.off()
|
7
|
463
|
33
|
464 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
465 p = p + geom_point(aes(colour=type), position="jitter")
|
|
466 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
467 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
468 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
469 print(p)
|
|
470 dev.off()
|
|
471 } else {
|
|
472 empty <- data.frame()
|
|
473 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
474
|
|
475 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
476 print(p)
|
|
477 dev.off()
|
|
478
|
|
479 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
480 print(p)
|
|
481 dev.off()
|
|
482 }
|
60
|
483
|
|
484 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
485 patient.merge.list.second = list()
|
|
486
|
7
|
487 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
488 onShort = "reads"
|
|
489 if(on == "Frequency"){
|
|
490 onShort = "freq"
|
|
491 }
|
18
|
492 onx = paste(on, ".x", sep="")
|
|
493 ony = paste(on, ".y", sep="")
|
|
494 onz = paste(on, ".z", sep="")
|
7
|
495 type="triplet"
|
|
496
|
|
497 threshholdIndex = which(colnames(product) == "interval")
|
|
498 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
499 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
500 titleIndex = which(colnames(product) == "Titles")
|
|
501 sampleIndex = which(colnames(patient1) == "Sample")
|
|
502 patientIndex = which(colnames(patient1) == "Patient")
|
|
503 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
504 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
505 threeSample = paste(patient3[1,sampleIndex], sep="")
|
60
|
506
|
29
|
507 if(mergeOn == "Clone_Sequence"){
|
|
508 patient1$merge = paste(patient1$Clone_Sequence)
|
|
509 patient2$merge = paste(patient2$Clone_Sequence)
|
|
510 patient3$merge = paste(patient3$Clone_Sequence)
|
|
511
|
|
512 } else {
|
|
513 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
514 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
515 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
516 }
|
60
|
517
|
|
518 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
519 patient1.fuzzy = patient1
|
|
520 patient2.fuzzy = patient2
|
|
521 patient3.fuzzy = patient3
|
|
522
|
|
523 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
|
524
|
|
525 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
526 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
527 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
|
528
|
|
529 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
65
|
530 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
60
|
531
|
|
532 other.sample.list = list()
|
|
533 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
534 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
535 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
|
536
|
9
|
537 patientMerge = merge(patient1, patient2, by="merge")
|
|
538 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
539 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
60
|
540 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
541 patientMerge = patientMerge[NULL,]
|
|
542
|
|
543 duo.merge.list = list()
|
|
544
|
20
|
545 patientMerge12 = merge(patient1, patient2, by="merge")
|
60
|
546 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
547 patientMerge12 = patientMerge12[NULL,]
|
|
548 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
549 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
|
550
|
20
|
551 patientMerge13 = merge(patient1, patient3, by="merge")
|
60
|
552 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
553 patientMerge13 = patientMerge13[NULL,]
|
|
554 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
555 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
|
556
|
20
|
557 patientMerge23 = merge(patient2, patient3, by="merge")
|
60
|
558 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
559 patientMerge23 = patientMerge23[NULL,]
|
|
560 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
561 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
|
562
|
|
563 merge.list = list()
|
|
564 merge.list[["second"]] = vector()
|
|
565
|
|
566 start.time = proc.time()
|
|
567 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
568 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
569 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
570 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
571 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
|
572
|
|
573 merge.list[["second"]] = patient.merge.list.second[[label1]]
|
|
574
|
|
575 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
576 } else {
|
|
577 while(nrow(patient.fuzzy) > 0){
|
|
578 first.merge = patient.fuzzy[1,"merge"]
|
|
579 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
580 first.sample = patient.fuzzy[1,"Sample"]
|
|
581
|
|
582 merge.filter = first.merge == patient.fuzzy$merge
|
|
583
|
|
584 second.sample = other.sample.list[[first.sample]][1]
|
|
585 third.sample = other.sample.list[[first.sample]][2]
|
|
586
|
|
587 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
588 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
589 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
|
590
|
|
591 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
|
592
|
|
593 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
594 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
595 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
|
596
|
|
597 matches.in.1 = any(match.filter.1)
|
|
598 matches.in.2 = any(match.filter.2)
|
|
599 matches.in.3 = any(match.filter.3)
|
|
600
|
|
601
|
|
602
|
|
603 rows.1 = patient.fuzzy[match.filter.1,]
|
|
604
|
|
605 sum.1 = data.frame(merge = first.clone.sequence,
|
|
606 Patient = label1,
|
|
607 Receptor = rows.1[1,"Receptor"],
|
|
608 Sample = rows.1[1,"Sample"],
|
|
609 Cell_Count = rows.1[1,"Cell_Count"],
|
|
610 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
611 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
612 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
613 dsPerM = sum(rows.1$dsPerM),
|
|
614 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
615 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
616 Clone_Sequence = first.clone.sequence,
|
|
617 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
618 Related_to_leukemia_clone = F,
|
|
619 Frequency = sum(rows.1$Frequency),
|
|
620 locus_V = rows.1[1,"locus_V"],
|
|
621 locus_J = rows.1[1,"locus_J"],
|
64
|
622 uniqueID = rows.1[1,"uniqueID"],
|
60
|
623 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
624 sum.2 = sum.1[NULL,]
|
|
625 rows.2 = patient.fuzzy[match.filter.2,]
|
|
626 if(matches.in.2){
|
|
627 sum.2 = data.frame(merge = first.clone.sequence,
|
|
628 Patient = label1,
|
|
629 Receptor = rows.2[1,"Receptor"],
|
|
630 Sample = rows.2[1,"Sample"],
|
|
631 Cell_Count = rows.2[1,"Cell_Count"],
|
|
632 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
633 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
634 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
635 dsPerM = sum(rows.2$dsPerM),
|
|
636 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
637 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
638 Clone_Sequence = first.clone.sequence,
|
|
639 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
640 Related_to_leukemia_clone = F,
|
|
641 Frequency = sum(rows.2$Frequency),
|
|
642 locus_V = rows.2[1,"locus_V"],
|
|
643 locus_J = rows.2[1,"locus_J"],
|
64
|
644 uniqueID = rows.2[1,"uniqueID"],
|
60
|
645 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
646 }
|
|
647
|
|
648 sum.3 = sum.1[NULL,]
|
|
649 rows.3 = patient.fuzzy[match.filter.3,]
|
|
650 if(matches.in.3){
|
|
651 sum.3 = data.frame(merge = first.clone.sequence,
|
|
652 Patient = label1,
|
|
653 Receptor = rows.3[1,"Receptor"],
|
|
654 Sample = rows.3[1,"Sample"],
|
|
655 Cell_Count = rows.3[1,"Cell_Count"],
|
|
656 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
657 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
658 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
659 dsPerM = sum(rows.3$dsPerM),
|
|
660 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
661 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
662 Clone_Sequence = first.clone.sequence,
|
|
663 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
664 Related_to_leukemia_clone = F,
|
|
665 Frequency = sum(rows.3$Frequency),
|
|
666 locus_V = rows.3[1,"locus_V"],
|
|
667 locus_J = rows.3[1,"locus_J"],
|
64
|
668 uniqueID = rows.3[1,"uniqueID"],
|
60
|
669 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
670 }
|
|
671
|
|
672 if(matches.in.2 & matches.in.3){
|
|
673 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
674 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
675 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
676 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
|
677
|
|
678 patientMerge = rbind(patientMerge, merge.123)
|
|
679 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
|
680
|
|
681 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
682 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
683
|
|
684 } else if (matches.in.2) {
|
|
685 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
686 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
687
|
|
688 second.sample = sum.2[,"Sample"]
|
|
689
|
|
690 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
691
|
|
692 merge.12 = merge(sum.1, sum.2, by="merge")
|
|
693
|
|
694 current.merge.list = rbind(current.merge.list, merge.12)
|
|
695 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
696
|
|
697 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
|
698
|
|
699 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
700 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
701
|
|
702 } else if (matches.in.3) {
|
|
703
|
|
704 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
705 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
706
|
|
707 second.sample = sum.3[,"Sample"]
|
|
708
|
|
709 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
710
|
|
711 merge.13 = merge(sum.1, sum.3, by="merge")
|
|
712
|
|
713 current.merge.list = rbind(current.merge.list, merge.13)
|
|
714 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
715
|
|
716 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
|
717
|
|
718 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
719 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
720
|
63
|
721 } else if(nrow(rows.1) > 1){
|
|
722 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
64
|
723 print(names(patient1)[names(patient1) %in% sum.1])
|
|
724 print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
725 print(names(patient1))
|
|
726 print(names(sum.1))
|
|
727 print(summary(sum.1))
|
|
728 print(summary(patient1))
|
|
729 print(dim(sum.1))
|
|
730 print(dim(patient1))
|
|
731 print(head(sum.1[,names(patient1)]))
|
|
732 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
63
|
733 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
60
|
734 } else {
|
|
735 patient.fuzzy = patient.fuzzy[-1,]
|
|
736 }
|
|
737
|
|
738 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
739 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
740
|
|
741 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
742 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
743 } else {
|
|
744 }
|
|
745
|
|
746 }
|
|
747 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
748
|
|
749 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
750 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
751 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
|
752
|
|
753 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
754 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
755 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
756
|
|
757 patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
758 }
|
|
759 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
760 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
761 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
762 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
20
|
763 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
60
|
764
|
63
|
765 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
766 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
767 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
62
|
768
|
60
|
769 if(F){
|
|
770 patientMerge = merge(patient1, patient2, by="merge")
|
|
771 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
772 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
773 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
774 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
775 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
776 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
777 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
778 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
779 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
780 }
|
28
|
781
|
30
|
782 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
783 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
784 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
785 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
786
|
7
|
787 res1 = vector()
|
|
788 res2 = vector()
|
|
789 res3 = vector()
|
20
|
790 res12 = vector()
|
|
791 res13 = vector()
|
|
792 res23 = vector()
|
7
|
793 resAll = vector()
|
|
794 read1Count = vector()
|
|
795 read2Count = vector()
|
|
796 read3Count = vector()
|
|
797
|
|
798 if(appendTriplets){
|
9
|
799 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
800 }
|
|
801 for(iter in 1:length(product[,1])){
|
|
802 threshhold = product[iter,threshholdIndex]
|
|
803 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
804 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
805 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
806 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
807
|
30
|
808 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
809 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
810 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
811
|
30
|
812 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
813 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
814 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
815
|
18
|
816 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
817 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
818 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
819 res1 = append(res1, sum(one))
|
|
820 res2 = append(res2, sum(two))
|
|
821 res3 = append(res3, sum(three))
|
|
822 resAll = append(resAll, sum(all))
|
20
|
823 res12 = append(res12, sum(one_two))
|
|
824 res13 = append(res13, sum(one_three))
|
|
825 res23 = append(res23, sum(two_three))
|
7
|
826 #threshhold = 0
|
|
827 if(threshhold != 0){
|
|
828 if(sum(one) > 0){
|
20
|
829 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
830 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
831 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
832 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
833 }
|
|
834 if(sum(two) > 0){
|
20
|
835 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
836 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
837 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
838 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
839 }
|
|
840 if(sum(three) > 0){
|
20
|
841 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
842 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
843 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
844 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
845 }
|
20
|
846 if(sum(one_two) > 0){
|
|
847 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
848 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
849 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
850 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
851 }
|
|
852 if(sum(one_three) > 0){
|
|
853 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
854 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
855 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
856 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
857 }
|
|
858 if(sum(two_three) > 0){
|
|
859 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
860 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
861 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
862 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
863 }
|
|
864 } else { #scatterplot data
|
24
|
865 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
60
|
866 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
30
|
867 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
868 if(sum(in_two) > 0){
|
|
869 scatterplot_locus_data[in_two,]$type = "In two"
|
|
870 }
|
30
|
871 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
872 if(sum(in_three)> 0){
|
|
873 scatterplot_locus_data[in_three,]$type = "In three"
|
|
874 }
|
|
875 not_in_one = scatterplot_locus_data$type != "In one"
|
|
876 if(sum(not_in_one) > 0){
|
|
877 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
878 }
|
20
|
879 p = NULL
|
|
880 if(nrow(scatterplot_locus_data) != 0){
|
|
881 if(on == "normalized_read_count"){
|
31
|
882 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
883 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
884 } else {
|
32
|
885 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
886 }
|
|
887 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
888 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
889 } else {
|
|
890 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
891 }
|
|
892 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
893 print(p)
|
|
894 dev.off()
|
|
895 }
|
7
|
896 if(sum(all) > 0){
|
20
|
897 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
898 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
899 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
900 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
901 }
|
|
902 }
|
20
|
903 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
904 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
905 colnames(patientResult)[6] = oneSample
|
20
|
906 colnames(patientResult)[7] = twoSample
|
|
907 colnames(patientResult)[8] = threeSample
|
|
908 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
909 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
910 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
911
|
|
912 colnamesBak = colnames(patientResult)
|
20
|
913 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
914 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
915 colnames(patientResult) = colnamesBak
|
|
916
|
|
917 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
918 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
919
|
|
920 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
921 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
922 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
923 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
924 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
925 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
926 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
927 print(plt)
|
|
928 dev.off()
|
|
929
|
|
930 fontSize = 4
|
|
931
|
|
932 bak = patientResult
|
|
933 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
934 patientResult$relativeValue = patientResult$value * 10
|
|
935 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
936 plt = ggplot(patientResult)
|
|
937 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
938 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
939 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
940 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
941 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
942 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
943 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
944 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
945 print(plt)
|
|
946 dev.off()
|
|
947 }
|
|
948
|
33
|
949 if(nrow(triplets) != 0){
|
60
|
950
|
|
951 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
952
|
33
|
953 triplets$uniqueID = "ID"
|
|
954
|
|
955 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
956 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
957 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
958
|
|
959 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
960 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
961 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
962
|
|
963 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
60
|
964
|
|
965 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
966
|
33
|
967 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
968 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
969 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
970
|
|
971 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
972 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
973
|
|
974 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
975
|
|
976 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
977
|
|
978 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
979
|
|
980 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
981
|
60
|
982 column_drops = c("min_cell_count", "min_cell_paste")
|
33
|
983
|
|
984 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
60
|
985
|
|
986 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
987
|
33
|
988 interval = intervalReads
|
|
989 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
990 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
991
|
|
992 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
993 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
994 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
995 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
996
|
|
997 one = triplets[triplets$Sample == "16278_Left",]
|
|
998 two = triplets[triplets$Sample == "26402_Left",]
|
|
999 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1000 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1001
|
|
1002 one = triplets[triplets$Sample == "16278_Right",]
|
|
1003 two = triplets[triplets$Sample == "26402_Right",]
|
|
1004 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1005 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1006
|
60
|
1007 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1008
|
33
|
1009 interval = intervalFreq
|
|
1010 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1011 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1012
|
|
1013 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
1014 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
1015 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
1016 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1017
|
|
1018 one = triplets[triplets$Sample == "16278_Left",]
|
|
1019 two = triplets[triplets$Sample == "26402_Left",]
|
|
1020 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1021 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1022
|
|
1023 one = triplets[triplets$Sample == "16278_Right",]
|
|
1024 two = triplets[triplets$Sample == "26402_Right",]
|
|
1025 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1026 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1027 } else {
|
|
1028 cat("", file="triplets.txt")
|
|
1029 }
|
60
|
1030 cat("</table></html>", file=logfile, append=T) |