0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
22
|
57 patients = split(dat, dat$Patient, drop=T)
|
9
|
58 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
59 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
60 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
61 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
62 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
63 Titles = factor(Titles, levels=Titles)
|
|
64 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
65
|
29
|
66 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
67
|
51
|
68 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
69 patient.merge.list.second = list()
|
56
|
70 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
71 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
0
|
72 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
73 if (!is.data.frame(x) & is.list(x)){
|
|
74 x = x[[1]]
|
|
75 }
|
34
|
76 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
77 x = data.frame(x,stringsAsFactors=F)
|
0
|
78 onShort = "reads"
|
|
79 if(on == "Frequency"){
|
|
80 onShort = "freq"
|
|
81 }
|
18
|
82 onx = paste(on, ".x", sep="")
|
|
83 ony = paste(on, ".y", sep="")
|
0
|
84 splt = split(x, x$Sample, drop=T)
|
4
|
85 type="pair"
|
2
|
86 if(length(splt) == 1){
|
3
|
87 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
88 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
89 type="single"
|
2
|
90 }
|
0
|
91 patient1 = splt[[1]]
|
|
92 patient2 = splt[[2]]
|
|
93
|
|
94 threshholdIndex = which(colnames(product) == "interval")
|
|
95 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
96 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
97 titleIndex = which(colnames(product) == "Titles")
|
|
98 sampleIndex = which(colnames(x) == "Sample")
|
|
99 patientIndex = which(colnames(x) == "Patient")
|
|
100 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
101 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
102 patient = paste(x[1,patientIndex])
|
3
|
103
|
0
|
104 switched = F
|
|
105 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
106 tmp = twoSample
|
|
107 twoSample = oneSample
|
|
108 oneSample = tmp
|
|
109 tmp = patient1
|
|
110 patient1 = patient2
|
|
111 patient2 = tmp
|
|
112 switched = T
|
|
113 }
|
|
114 if(appendtxt){
|
4
|
115 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
116 }
|
51
|
117 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
9
|
118
|
29
|
119 if(mergeOn == "Clone_Sequence"){
|
|
120 patient1$merge = paste(patient1$Clone_Sequence)
|
|
121 patient2$merge = paste(patient2$Clone_Sequence)
|
|
122 } else {
|
|
123 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
124 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
125 }
|
9
|
126
|
30
|
127 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
128 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
129 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
130 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
131 scatterplot_data$on = onShort
|
|
132
|
49
|
133 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
134 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
135
|
|
136 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
137
|
51
|
138 start.time = proc.time()
|
|
139 merge.list = c()
|
|
140
|
|
141 if(patient %in% names(patient.merge.list)){
|
|
142 patientMerge = patient.merge.list[[patient]]
|
|
143 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
144 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
145
|
|
146 print(names(patient.merge.list))
|
|
147 } else {
|
|
148 #fuzzy matching here...
|
49
|
149 #merge.list = patientMerge$merge
|
51
|
150
|
49
|
151 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
152 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
153
|
|
154 patient1.fuzzy = patient1
|
|
155 patient2.fuzzy = patient2
|
|
156
|
36
|
157 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
158 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
159
|
48
|
160 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
161 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
162
|
48
|
163 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
164 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
51
|
165
|
49
|
166 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
167 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
51
|
168
|
49
|
169 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
170 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
51
|
171
|
37
|
172 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
173 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
49
|
174
|
|
175 merge.list = list()
|
|
176
|
|
177 merge.list[["second"]] = vector()
|
|
178
|
37
|
179 while(nrow(patient.fuzzy) > 1){
|
|
180 first.merge = patient.fuzzy[1,"merge"]
|
|
181 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
49
|
182 first.sample = patient.fuzzy[1,"Sample"]
|
37
|
183 merge.filter = first.merge == patient.fuzzy$merge
|
51
|
184
|
49
|
185 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
51
|
186
|
49
|
187 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
188 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
189
|
|
190 #first match same sample, sum to a single row, same for other sample
|
|
191 #then merge rows like 'normal'
|
51
|
192
|
48
|
193 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
49
|
194
|
|
195
|
|
196
|
44
|
197 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
49
|
198 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
199 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
200
|
|
201 first.rows = patient.fuzzy[first.match.filter,]
|
|
202 second.rows = patient.fuzzy[second.match.filter,]
|
|
203
|
50
|
204 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
205 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
206 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
207 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
208
|
49
|
209 first.sum = data.frame(merge = first.clone.sequence,
|
|
210 Patient = patient,
|
|
211 Receptor = first.rows[1,"Receptor"],
|
|
212 Sample = first.rows[1,"Sample"],
|
|
213 Cell_Count = first.rows[1,"Cell_Count"],
|
|
214 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
215 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
216 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
217 dsPerM = sum(first.rows$dsPerM),
|
50
|
218 J_Segment_Major_Gene = first.rows.j,
|
|
219 V_Segment_Major_Gene = first.rows.v,
|
49
|
220 Clone_Sequence = first.clone.sequence,
|
|
221 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
222 Related_to_leukemia_clone = F,
|
|
223 Frequency = sum(first.rows$Frequency),
|
|
224 locus_V = first.rows[1,"locus_V"],
|
|
225 locus_J = first.rows[1,"locus_J"],
|
|
226 min_cell_count = first.rows[1,"min_cell_count"],
|
|
227 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
228 paste = first.rows[1,"paste"],
|
|
229 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
230
|
|
231 if(nrow(second.rows) > 0){
|
50
|
232 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
233 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
234 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
235 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
236
|
55
|
237 second.sum = data.frame(merge = first.clone.sequence,
|
49
|
238 Patient = patient,
|
|
239 Receptor = second.rows[1,"Receptor"],
|
|
240 Sample = second.rows[1,"Sample"],
|
|
241 Cell_Count = second.rows[1,"Cell_Count"],
|
|
242 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
243 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
244 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
245 dsPerM = sum(second.rows$dsPerM),
|
50
|
246 J_Segment_Major_Gene = second.rows.j,
|
|
247 V_Segment_Major_Gene = second.rows.v,
|
49
|
248 Clone_Sequence = first.clone.sequence,
|
|
249 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
250 Related_to_leukemia_clone = F,
|
|
251 Frequency = sum(second.rows$Frequency),
|
|
252 locus_V = second.rows[1,"locus_V"],
|
|
253 locus_J = second.rows[1,"locus_J"],
|
|
254 min_cell_count = second.rows[1,"min_cell_count"],
|
|
255 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
256 paste = second.rows[1,"paste"],
|
|
257 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
258
|
|
259 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
260 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
261
|
50
|
262 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
263 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
49
|
264
|
56
|
265 tmp.rows = rbind(first.rows, second.rows)
|
|
266 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
267
|
|
268 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
269 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
270 } else {
|
|
271 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
272 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
273 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
274 } else {
|
58
|
275 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
49
|
276 }
|
|
277 }
|
|
278
|
59
|
279 } else if(nrow(first.rows) > 1) {
|
|
280 if(patient1[1,"Sample"] == first.sample){
|
|
281 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
282 patient1 = rbind(patient1, first.sum)
|
|
283 } else {
|
|
284 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
285 patient2 = rbind(patient2, first.sum)
|
|
286 }
|
|
287
|
|
288 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
289 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
290
|
|
291 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
|
292
|
|
293 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
37
|
294 } else {
|
|
295 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
296 }
|
|
297 }
|
51
|
298 patient.merge.list[[patient]] <<- patientMerge
|
|
299 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
300 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
34
|
301 }
|
51
|
302
|
52
|
303 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
304 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
51
|
305
|
37
|
306
|
18
|
307 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
308 res1 = vector()
|
|
309 res2 = vector()
|
|
310 resBoth = vector()
|
|
311 read1Count = vector()
|
|
312 read2Count = vector()
|
2
|
313 locussum1 = vector()
|
|
314 locussum2 = vector()
|
9
|
315
|
0
|
316 #for(iter in 1){
|
|
317 for(iter in 1:length(product[,1])){
|
|
318 threshhold = product[iter,threshholdIndex]
|
|
319 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
320 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
321 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
322 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
323 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
324 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
325 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
326 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
327 res1 = append(res1, sum(one))
|
2
|
328 res2 = append(res2, sum(two))
|
0
|
329 resBoth = append(resBoth, sum(both))
|
2
|
330 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
331 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
332 #threshhold = 0
|
|
333 if(threshhold != 0){
|
|
334 if(sum(one) > 0){
|
15
|
335 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
336 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
337 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
338 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
339 }
|
|
340 if(sum(two) > 0){
|
15
|
341 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
342 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
343 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
344 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
345 }
|
29
|
346 } else {
|
|
347 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
49
|
348 #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),]
|
|
349 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
29
|
350 if(nrow(scatterplot_locus_data) > 0){
|
|
351 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
352 }
|
34
|
353 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
354 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
355 if(any(in_two)){
|
49
|
356 scatterplot_locus_data[in_two,]$type = twoSample
|
34
|
357 }
|
49
|
358 in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
|
|
359 #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
|
|
360 #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
|
30
|
361 if(any(in_both)){
|
|
362 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
363 }
|
29
|
364 if(type == "single"){
|
|
365 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
366 }
|
|
367 p = NULL
|
|
368 if(nrow(scatterplot_locus_data) != 0){
|
|
369 if(on == "normalized_read_count"){
|
30
|
370 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
49
|
371 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
372 } else {
|
49
|
373 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
374 }
|
|
375 p = p + geom_point(aes(colour=type), position="jitter")
|
|
376 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
377 } else {
|
|
378 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
379 }
|
|
380 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
381 print(p)
|
|
382 dev.off()
|
0
|
383 }
|
|
384 if(sum(both) > 0){
|
15
|
385 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
386 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
387 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
388 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
389 }
|
0
|
390 }
|
2
|
391 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
392 if(sum(is.na(patientResult$percentage)) > 0){
|
|
393 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
394 }
|
|
395 colnames(patientResult)[6] = oneSample
|
|
396 colnames(patientResult)[8] = twoSample
|
|
397 colnamesBak = colnames(patientResult)
|
2
|
398 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
399 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
400 colnames(patientResult) = colnamesBak
|
|
401
|
|
402 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
403 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
404
|
|
405 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
406 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
407 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
408 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
409 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
410 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
411 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
412 print(plt)
|
|
413 dev.off()
|
|
414 #(t,r,b,l)
|
|
415 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
416 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
417 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
418 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
419 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
420 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
421 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
422 print(plt)
|
|
423 dev.off()
|
|
424
|
|
425 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
426 patientResult$relativeValue = patientResult$value * 10
|
|
427 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
428 plt = ggplot(patientResult)
|
|
429 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
430 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
431 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
432 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
433 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
434 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
435 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
436 print(plt)
|
|
437 dev.off()
|
|
438 }
|
|
439
|
3
|
440 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
441
|
0
|
442 interval = intervalFreq
|
|
443 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
444 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
445 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
446
|
3
|
447 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
448
|
0
|
449 interval = intervalReads
|
|
450 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
451 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
452 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
33
|
453
|
|
454 if(nrow(single_patients) > 0){
|
|
455 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
456 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
457 p = p + geom_point(aes(colour=type), position="jitter")
|
|
458 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
459 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
460 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
461 print(p)
|
|
462 dev.off()
|
7
|
463
|
33
|
464 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
465 p = p + geom_point(aes(colour=type), position="jitter")
|
|
466 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
467 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
468 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
469 print(p)
|
|
470 dev.off()
|
|
471 } else {
|
|
472 empty <- data.frame()
|
|
473 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
474
|
|
475 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
476 print(p)
|
|
477 dev.off()
|
|
478
|
|
479 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
480 print(p)
|
|
481 dev.off()
|
|
482 }
|
60
|
483
|
|
484 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
485 patient.merge.list.second = list()
|
|
486
|
7
|
487 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
488 onShort = "reads"
|
|
489 if(on == "Frequency"){
|
|
490 onShort = "freq"
|
|
491 }
|
18
|
492 onx = paste(on, ".x", sep="")
|
|
493 ony = paste(on, ".y", sep="")
|
|
494 onz = paste(on, ".z", sep="")
|
7
|
495 type="triplet"
|
|
496
|
|
497 threshholdIndex = which(colnames(product) == "interval")
|
|
498 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
499 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
500 titleIndex = which(colnames(product) == "Titles")
|
|
501 sampleIndex = which(colnames(patient1) == "Sample")
|
|
502 patientIndex = which(colnames(patient1) == "Patient")
|
|
503 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
504 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
505 threeSample = paste(patient3[1,sampleIndex], sep="")
|
60
|
506
|
29
|
507 if(mergeOn == "Clone_Sequence"){
|
|
508 patient1$merge = paste(patient1$Clone_Sequence)
|
|
509 patient2$merge = paste(patient2$Clone_Sequence)
|
|
510 patient3$merge = paste(patient3$Clone_Sequence)
|
|
511
|
|
512 } else {
|
|
513 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
514 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
515 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
516 }
|
60
|
517
|
|
518 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
519 patient1.fuzzy = patient1
|
|
520 patient2.fuzzy = patient2
|
|
521 patient3.fuzzy = patient3
|
|
522
|
|
523 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
|
524
|
|
525 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
526 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
527 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
|
528
|
|
529 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
|
530
|
|
531 other.sample.list = list()
|
|
532 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
533 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
534 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
|
535
|
9
|
536 patientMerge = merge(patient1, patient2, by="merge")
|
|
537 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
538 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
60
|
539 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
540 patientMerge = patientMerge[NULL,]
|
|
541
|
|
542 duo.merge.list = list()
|
|
543
|
20
|
544 patientMerge12 = merge(patient1, patient2, by="merge")
|
60
|
545 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
546 patientMerge12 = patientMerge12[NULL,]
|
|
547 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
548 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
|
549
|
20
|
550 patientMerge13 = merge(patient1, patient3, by="merge")
|
60
|
551 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
552 patientMerge13 = patientMerge13[NULL,]
|
|
553 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
554 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
|
555
|
20
|
556 patientMerge23 = merge(patient2, patient3, by="merge")
|
60
|
557 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
558 patientMerge23 = patientMerge23[NULL,]
|
|
559 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
560 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
|
561
|
|
562 merge.list = list()
|
|
563 merge.list[["second"]] = vector()
|
|
564
|
|
565 start.time = proc.time()
|
|
566 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
567 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
568 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
569 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
570 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
|
571
|
|
572 merge.list[["second"]] = patient.merge.list.second[[label1]]
|
|
573
|
|
574 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
575 } else {
|
|
576 while(nrow(patient.fuzzy) > 0){
|
|
577 first.merge = patient.fuzzy[1,"merge"]
|
|
578 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
579 first.sample = patient.fuzzy[1,"Sample"]
|
|
580
|
|
581 merge.filter = first.merge == patient.fuzzy$merge
|
|
582
|
|
583 second.sample = other.sample.list[[first.sample]][1]
|
|
584 third.sample = other.sample.list[[first.sample]][2]
|
|
585
|
|
586 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
587 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
588 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
|
589
|
|
590 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
|
591
|
|
592 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
593 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
594 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
|
595
|
|
596 matches.in.1 = any(match.filter.1)
|
|
597 matches.in.2 = any(match.filter.2)
|
|
598 matches.in.3 = any(match.filter.3)
|
|
599
|
|
600
|
|
601
|
|
602 rows.1 = patient.fuzzy[match.filter.1,]
|
|
603
|
|
604 sum.1 = data.frame(merge = first.clone.sequence,
|
|
605 Patient = label1,
|
|
606 Receptor = rows.1[1,"Receptor"],
|
|
607 Sample = rows.1[1,"Sample"],
|
|
608 Cell_Count = rows.1[1,"Cell_Count"],
|
|
609 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
610 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
611 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
612 dsPerM = sum(rows.1$dsPerM),
|
|
613 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
614 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
615 Clone_Sequence = first.clone.sequence,
|
|
616 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
617 Related_to_leukemia_clone = F,
|
|
618 Frequency = sum(rows.1$Frequency),
|
|
619 locus_V = rows.1[1,"locus_V"],
|
|
620 locus_J = rows.1[1,"locus_J"],
|
|
621 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
622 sum.2 = sum.1[NULL,]
|
|
623 rows.2 = patient.fuzzy[match.filter.2,]
|
|
624 if(matches.in.2){
|
|
625 sum.2 = data.frame(merge = first.clone.sequence,
|
|
626 Patient = label1,
|
|
627 Receptor = rows.2[1,"Receptor"],
|
|
628 Sample = rows.2[1,"Sample"],
|
|
629 Cell_Count = rows.2[1,"Cell_Count"],
|
|
630 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
631 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
632 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
633 dsPerM = sum(rows.2$dsPerM),
|
|
634 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
635 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
636 Clone_Sequence = first.clone.sequence,
|
|
637 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
638 Related_to_leukemia_clone = F,
|
|
639 Frequency = sum(rows.2$Frequency),
|
|
640 locus_V = rows.2[1,"locus_V"],
|
|
641 locus_J = rows.2[1,"locus_J"],
|
|
642 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
643 }
|
|
644
|
|
645 sum.3 = sum.1[NULL,]
|
|
646 rows.3 = patient.fuzzy[match.filter.3,]
|
|
647 if(matches.in.3){
|
|
648 sum.3 = data.frame(merge = first.clone.sequence,
|
|
649 Patient = label1,
|
|
650 Receptor = rows.3[1,"Receptor"],
|
|
651 Sample = rows.3[1,"Sample"],
|
|
652 Cell_Count = rows.3[1,"Cell_Count"],
|
|
653 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
654 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
655 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
656 dsPerM = sum(rows.3$dsPerM),
|
|
657 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
658 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
659 Clone_Sequence = first.clone.sequence,
|
|
660 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
661 Related_to_leukemia_clone = F,
|
|
662 Frequency = sum(rows.3$Frequency),
|
|
663 locus_V = rows.3[1,"locus_V"],
|
|
664 locus_J = rows.3[1,"locus_J"],
|
|
665 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
666 }
|
|
667
|
|
668 if(matches.in.2 & matches.in.3){
|
|
669 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
670 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
671 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
672 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
|
673
|
|
674 patientMerge = rbind(patientMerge, merge.123)
|
|
675 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
|
676
|
|
677 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
678 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
679
|
|
680 } else if (matches.in.2) {
|
|
681 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
682 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
683
|
|
684 second.sample = sum.2[,"Sample"]
|
|
685
|
|
686 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
687
|
|
688 merge.12 = merge(sum.1, sum.2, by="merge")
|
|
689
|
|
690 current.merge.list = rbind(current.merge.list, merge.12)
|
|
691 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
692
|
|
693 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
|
694
|
|
695 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
696 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
697
|
|
698 } else if (matches.in.3) {
|
|
699
|
|
700 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
701 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
702
|
|
703 second.sample = sum.3[,"Sample"]
|
|
704
|
|
705 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
706
|
|
707 merge.13 = merge(sum.1, sum.3, by="merge")
|
|
708
|
|
709 current.merge.list = rbind(current.merge.list, merge.13)
|
|
710 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
711
|
|
712 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
|
713
|
|
714 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
715 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
716
|
63
|
717 } else if(nrow(rows.1) > 1){
|
|
718 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
|
719 patient1 = rbind(patient1, sum.1)
|
|
720 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
60
|
721 } else {
|
|
722 patient.fuzzy = patient.fuzzy[-1,]
|
|
723 }
|
|
724
|
|
725 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
726 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
727
|
|
728 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
729 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
730 } else {
|
|
731 }
|
|
732
|
|
733 }
|
|
734 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
735
|
|
736 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
737 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
738 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
|
739
|
|
740 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
741 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
742 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
743
|
|
744 patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
745 }
|
|
746 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
747 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
748 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
749 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
20
|
750 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
60
|
751
|
63
|
752 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
753 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
754 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
62
|
755
|
60
|
756 if(F){
|
|
757 patientMerge = merge(patient1, patient2, by="merge")
|
|
758 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
759 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
760 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
761 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
762 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
763 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
764 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
765 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
766 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
767 }
|
28
|
768
|
30
|
769 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
770 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
771 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
772 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
773
|
7
|
774 res1 = vector()
|
|
775 res2 = vector()
|
|
776 res3 = vector()
|
20
|
777 res12 = vector()
|
|
778 res13 = vector()
|
|
779 res23 = vector()
|
7
|
780 resAll = vector()
|
|
781 read1Count = vector()
|
|
782 read2Count = vector()
|
|
783 read3Count = vector()
|
|
784
|
|
785 if(appendTriplets){
|
9
|
786 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
787 }
|
|
788 for(iter in 1:length(product[,1])){
|
|
789 threshhold = product[iter,threshholdIndex]
|
|
790 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
791 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
792 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
793 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
794
|
30
|
795 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
796 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
797 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
798
|
30
|
799 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
800 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
801 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
802
|
18
|
803 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
804 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
805 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
806 res1 = append(res1, sum(one))
|
|
807 res2 = append(res2, sum(two))
|
|
808 res3 = append(res3, sum(three))
|
|
809 resAll = append(resAll, sum(all))
|
20
|
810 res12 = append(res12, sum(one_two))
|
|
811 res13 = append(res13, sum(one_three))
|
|
812 res23 = append(res23, sum(two_three))
|
7
|
813 #threshhold = 0
|
|
814 if(threshhold != 0){
|
|
815 if(sum(one) > 0){
|
20
|
816 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
817 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
818 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
819 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
820 }
|
|
821 if(sum(two) > 0){
|
20
|
822 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
823 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
824 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
825 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
826 }
|
|
827 if(sum(three) > 0){
|
20
|
828 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
829 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
830 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
831 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
832 }
|
20
|
833 if(sum(one_two) > 0){
|
|
834 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
835 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
836 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
837 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
838 }
|
|
839 if(sum(one_three) > 0){
|
|
840 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
841 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
842 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
843 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
844 }
|
|
845 if(sum(two_three) > 0){
|
|
846 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
847 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
848 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
849 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
850 }
|
|
851 } else { #scatterplot data
|
24
|
852 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
60
|
853 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
30
|
854 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
855 if(sum(in_two) > 0){
|
|
856 scatterplot_locus_data[in_two,]$type = "In two"
|
|
857 }
|
30
|
858 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
859 if(sum(in_three)> 0){
|
|
860 scatterplot_locus_data[in_three,]$type = "In three"
|
|
861 }
|
|
862 not_in_one = scatterplot_locus_data$type != "In one"
|
|
863 if(sum(not_in_one) > 0){
|
|
864 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
865 }
|
20
|
866 p = NULL
|
|
867 if(nrow(scatterplot_locus_data) != 0){
|
|
868 if(on == "normalized_read_count"){
|
31
|
869 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
870 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
871 } else {
|
32
|
872 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
873 }
|
|
874 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
875 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
876 } else {
|
|
877 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
878 }
|
|
879 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
880 print(p)
|
|
881 dev.off()
|
|
882 }
|
7
|
883 if(sum(all) > 0){
|
20
|
884 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
885 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
886 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
887 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
888 }
|
|
889 }
|
20
|
890 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
891 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
892 colnames(patientResult)[6] = oneSample
|
20
|
893 colnames(patientResult)[7] = twoSample
|
|
894 colnames(patientResult)[8] = threeSample
|
|
895 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
896 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
897 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
898
|
|
899 colnamesBak = colnames(patientResult)
|
20
|
900 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
901 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
902 colnames(patientResult) = colnamesBak
|
|
903
|
|
904 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
905 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
906
|
|
907 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
908 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
909 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
910 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
911 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
912 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
913 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
914 print(plt)
|
|
915 dev.off()
|
|
916
|
|
917 fontSize = 4
|
|
918
|
|
919 bak = patientResult
|
|
920 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
921 patientResult$relativeValue = patientResult$value * 10
|
|
922 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
923 plt = ggplot(patientResult)
|
|
924 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
925 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
926 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
927 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
928 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
929 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
930 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
931 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
932 print(plt)
|
|
933 dev.off()
|
|
934 }
|
|
935
|
33
|
936 if(nrow(triplets) != 0){
|
60
|
937
|
|
938 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
939
|
33
|
940 triplets$uniqueID = "ID"
|
|
941
|
|
942 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
943 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
944 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
945
|
|
946 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
947 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
948 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
949
|
|
950 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
60
|
951
|
|
952 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
953
|
33
|
954 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
955 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
956 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
957
|
|
958 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
959 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
960
|
|
961 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
962
|
|
963 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
964
|
|
965 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
966
|
|
967 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
968
|
60
|
969 column_drops = c("min_cell_count", "min_cell_paste")
|
33
|
970
|
|
971 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
60
|
972
|
|
973 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
974
|
33
|
975 interval = intervalReads
|
|
976 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
977 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
978
|
|
979 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
980 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
981 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
982 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
983
|
|
984 one = triplets[triplets$Sample == "16278_Left",]
|
|
985 two = triplets[triplets$Sample == "26402_Left",]
|
|
986 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
987 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
988
|
|
989 one = triplets[triplets$Sample == "16278_Right",]
|
|
990 two = triplets[triplets$Sample == "26402_Right",]
|
|
991 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
992 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
993
|
60
|
994 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
995
|
33
|
996 interval = intervalFreq
|
|
997 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
998 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
999
|
|
1000 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
1001 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
1002 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
1003 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1004
|
|
1005 one = triplets[triplets$Sample == "16278_Left",]
|
|
1006 two = triplets[triplets$Sample == "26402_Left",]
|
|
1007 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1008 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1009
|
|
1010 one = triplets[triplets$Sample == "16278_Right",]
|
|
1011 two = triplets[triplets$Sample == "26402_Right",]
|
|
1012 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1013 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1014 } else {
|
|
1015 cat("", file="triplets.txt")
|
|
1016 }
|
60
|
1017 cat("</table></html>", file=logfile, append=T) |