0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
22
|
57 patients = split(dat, dat$Patient, drop=T)
|
9
|
58 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
59 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
60 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
61 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
62 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
63 Titles = factor(Titles, levels=Titles)
|
|
64 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
65
|
29
|
66 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
67
|
51
|
68 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
69 patient.merge.list.second = list()
|
54
|
70 cat(paste("<table border='0'>", sep=""), file="multiple_matches.html", append=T)
|
0
|
71 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
72 if (!is.data.frame(x) & is.list(x)){
|
|
73 x = x[[1]]
|
|
74 }
|
34
|
75 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
76 x = data.frame(x,stringsAsFactors=F)
|
0
|
77 onShort = "reads"
|
|
78 if(on == "Frequency"){
|
|
79 onShort = "freq"
|
|
80 }
|
18
|
81 onx = paste(on, ".x", sep="")
|
|
82 ony = paste(on, ".y", sep="")
|
0
|
83 splt = split(x, x$Sample, drop=T)
|
4
|
84 type="pair"
|
2
|
85 if(length(splt) == 1){
|
3
|
86 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
87 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
88 type="single"
|
2
|
89 }
|
0
|
90 patient1 = splt[[1]]
|
|
91 patient2 = splt[[2]]
|
|
92
|
|
93 threshholdIndex = which(colnames(product) == "interval")
|
|
94 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
95 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
96 titleIndex = which(colnames(product) == "Titles")
|
|
97 sampleIndex = which(colnames(x) == "Sample")
|
|
98 patientIndex = which(colnames(x) == "Patient")
|
|
99 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
100 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
101 patient = paste(x[1,patientIndex])
|
3
|
102
|
0
|
103 switched = F
|
|
104 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
105 tmp = twoSample
|
|
106 twoSample = oneSample
|
|
107 oneSample = tmp
|
|
108 tmp = patient1
|
|
109 patient1 = patient2
|
|
110 patient2 = tmp
|
|
111 switched = T
|
|
112 }
|
|
113 if(appendtxt){
|
4
|
114 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
115 }
|
51
|
116 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
9
|
117
|
29
|
118 if(mergeOn == "Clone_Sequence"){
|
|
119 patient1$merge = paste(patient1$Clone_Sequence)
|
|
120 patient2$merge = paste(patient2$Clone_Sequence)
|
|
121 } else {
|
|
122 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
123 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
124 }
|
9
|
125
|
30
|
126 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
127 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
128 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
129 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
130 scatterplot_data$on = onShort
|
|
131
|
49
|
132 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
133 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
134
|
|
135 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
136
|
51
|
137 start.time = proc.time()
|
|
138 merge.list = c()
|
|
139
|
|
140 if(patient %in% names(patient.merge.list)){
|
|
141 patientMerge = patient.merge.list[[patient]]
|
|
142 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
143 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
144
|
|
145 print(names(patient.merge.list))
|
|
146 } else {
|
|
147 #fuzzy matching here...
|
49
|
148 #merge.list = patientMerge$merge
|
51
|
149
|
49
|
150 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
151 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
152
|
|
153 patient1.fuzzy = patient1
|
|
154 patient2.fuzzy = patient2
|
|
155
|
36
|
156 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
157 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
158
|
48
|
159 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
161
|
48
|
162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
51
|
164
|
49
|
165 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
166 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
51
|
167
|
49
|
168 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
169 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
51
|
170
|
37
|
171 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
172 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
49
|
173
|
|
174 merge.list = list()
|
|
175
|
|
176 merge.list[["second"]] = vector()
|
|
177
|
37
|
178 while(nrow(patient.fuzzy) > 1){
|
|
179 first.merge = patient.fuzzy[1,"merge"]
|
|
180 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
49
|
181 first.sample = patient.fuzzy[1,"Sample"]
|
37
|
182 merge.filter = first.merge == patient.fuzzy$merge
|
51
|
183
|
49
|
184 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
51
|
185
|
49
|
186 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
187 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
188
|
|
189 #first match same sample, sum to a single row, same for other sample
|
|
190 #then merge rows like 'normal'
|
51
|
191
|
48
|
192 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
49
|
193
|
|
194
|
|
195
|
44
|
196 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
49
|
197 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
198 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
199
|
|
200 first.rows = patient.fuzzy[first.match.filter,]
|
|
201 second.rows = patient.fuzzy[second.match.filter,]
|
|
202
|
50
|
203 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
204 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
205 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
206 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
207
|
49
|
208 first.sum = data.frame(merge = first.clone.sequence,
|
|
209 Patient = patient,
|
|
210 Receptor = first.rows[1,"Receptor"],
|
|
211 Sample = first.rows[1,"Sample"],
|
|
212 Cell_Count = first.rows[1,"Cell_Count"],
|
|
213 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
214 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
215 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
216 dsPerM = sum(first.rows$dsPerM),
|
50
|
217 J_Segment_Major_Gene = first.rows.j,
|
|
218 V_Segment_Major_Gene = first.rows.v,
|
49
|
219 Clone_Sequence = first.clone.sequence,
|
|
220 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
221 Related_to_leukemia_clone = F,
|
|
222 Frequency = sum(first.rows$Frequency),
|
|
223 locus_V = first.rows[1,"locus_V"],
|
|
224 locus_J = first.rows[1,"locus_J"],
|
|
225 min_cell_count = first.rows[1,"min_cell_count"],
|
|
226 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
227 paste = first.rows[1,"paste"],
|
|
228 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
229
|
|
230 if(nrow(second.rows) > 0){
|
50
|
231 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
232 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
233 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
234 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
235
|
54
|
236 second.sum = data.frame(merge = paste(first.clone.sequence, "*", sep=""),
|
49
|
237 Patient = patient,
|
|
238 Receptor = second.rows[1,"Receptor"],
|
|
239 Sample = second.rows[1,"Sample"],
|
|
240 Cell_Count = second.rows[1,"Cell_Count"],
|
|
241 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
242 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
243 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
244 dsPerM = sum(second.rows$dsPerM),
|
50
|
245 J_Segment_Major_Gene = second.rows.j,
|
|
246 V_Segment_Major_Gene = second.rows.v,
|
49
|
247 Clone_Sequence = first.clone.sequence,
|
|
248 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
249 Related_to_leukemia_clone = F,
|
|
250 Frequency = sum(second.rows$Frequency),
|
|
251 locus_V = second.rows[1,"locus_V"],
|
|
252 locus_J = second.rows[1,"locus_J"],
|
|
253 min_cell_count = second.rows[1,"min_cell_count"],
|
|
254 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
255 paste = second.rows[1,"paste"],
|
|
256 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
257
|
|
258 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
259 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
260
|
50
|
261 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
262 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
49
|
263
|
|
264 if(sum(first.match.filter) == 1 & sum(second.match.filter) == 1){
|
|
265 second.clone.sequence = patient.fuzzy[second.match.filter, "Clone_Sequence"]
|
50
|
266 if(first.clone.sequence != second.clone.sequence){
|
|
267 #merge.list[["second"]] = append(merge.list[["second"]], second.clone.sequence)
|
49
|
268 }
|
|
269 }
|
|
270
|
|
271 if(nrow(first.rows) > 1 | nrow(second.rows) > 1){
|
54
|
272 cat(paste("<tr><td>", patient, ": ", nrow(first.rows), " ", nrow(second.rows), " ", first.clone.sequence, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
49
|
273 }
|
|
274
|
37
|
275 } else {
|
|
276 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
277 }
|
|
278 }
|
51
|
279 patient.merge.list[[patient]] <<- patientMerge
|
|
280 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
281 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
34
|
282 }
|
51
|
283
|
52
|
284 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
285 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
51
|
286
|
37
|
287
|
18
|
288 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
289 res1 = vector()
|
|
290 res2 = vector()
|
|
291 resBoth = vector()
|
|
292 read1Count = vector()
|
|
293 read2Count = vector()
|
2
|
294 locussum1 = vector()
|
|
295 locussum2 = vector()
|
9
|
296
|
0
|
297 #for(iter in 1){
|
|
298 for(iter in 1:length(product[,1])){
|
|
299 threshhold = product[iter,threshholdIndex]
|
|
300 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
301 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
302 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
303 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
304 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
305 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
306 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
307 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
308 res1 = append(res1, sum(one))
|
2
|
309 res2 = append(res2, sum(two))
|
0
|
310 resBoth = append(resBoth, sum(both))
|
2
|
311 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
312 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
313 #threshhold = 0
|
|
314 if(threshhold != 0){
|
|
315 if(sum(one) > 0){
|
15
|
316 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
317 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
318 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
319 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
320 }
|
|
321 if(sum(two) > 0){
|
15
|
322 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
323 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
324 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
325 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
326 }
|
29
|
327 } else {
|
|
328 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
49
|
329 #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),]
|
|
330 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
29
|
331 if(nrow(scatterplot_locus_data) > 0){
|
|
332 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
333 }
|
34
|
334 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
335 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
336 if(any(in_two)){
|
49
|
337 scatterplot_locus_data[in_two,]$type = twoSample
|
34
|
338 }
|
49
|
339 in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
|
|
340 #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
|
|
341 #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
|
30
|
342 if(any(in_both)){
|
|
343 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
344 }
|
29
|
345 if(type == "single"){
|
|
346 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
347 }
|
|
348 p = NULL
|
|
349 if(nrow(scatterplot_locus_data) != 0){
|
|
350 if(on == "normalized_read_count"){
|
30
|
351 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
49
|
352 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
353 } else {
|
49
|
354 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
355 }
|
|
356 p = p + geom_point(aes(colour=type), position="jitter")
|
|
357 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
358 } else {
|
|
359 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
360 }
|
|
361 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
362 print(p)
|
|
363 dev.off()
|
0
|
364 }
|
|
365 if(sum(both) > 0){
|
15
|
366 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
367 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
368 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
369 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
370 }
|
0
|
371 }
|
2
|
372 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
373 if(sum(is.na(patientResult$percentage)) > 0){
|
|
374 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
375 }
|
|
376 colnames(patientResult)[6] = oneSample
|
|
377 colnames(patientResult)[8] = twoSample
|
|
378 colnamesBak = colnames(patientResult)
|
2
|
379 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
380 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
381 colnames(patientResult) = colnamesBak
|
|
382
|
|
383 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
384 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
385
|
|
386 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
387 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
388 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
389 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
390 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
391 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
392 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
393 print(plt)
|
|
394 dev.off()
|
|
395 #(t,r,b,l)
|
|
396 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
397 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
398 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
399 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
400 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
401 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
402 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
403 print(plt)
|
|
404 dev.off()
|
|
405
|
|
406 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
407 patientResult$relativeValue = patientResult$value * 10
|
|
408 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
409 plt = ggplot(patientResult)
|
|
410 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
411 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
412 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
413 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
414 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
415 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
416 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
417 print(plt)
|
|
418 dev.off()
|
|
419 }
|
|
420
|
3
|
421 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
422
|
0
|
423 interval = intervalFreq
|
|
424 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
425 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
426 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
427
|
3
|
428 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
429
|
0
|
430 interval = intervalReads
|
|
431 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
432 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
433 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
434
|
3
|
435 cat("</table></html>", file=logfile, append=T)
|
|
436
|
33
|
437
|
|
438 if(nrow(single_patients) > 0){
|
|
439 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
440 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
441 p = p + geom_point(aes(colour=type), position="jitter")
|
|
442 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
443 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
444 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
445 print(p)
|
|
446 dev.off()
|
7
|
447
|
33
|
448 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
449 p = p + geom_point(aes(colour=type), position="jitter")
|
|
450 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
451 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
452 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
453 print(p)
|
|
454 dev.off()
|
|
455 } else {
|
|
456 empty <- data.frame()
|
|
457 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
458
|
|
459 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
460 print(p)
|
|
461 dev.off()
|
|
462
|
|
463 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
464 print(p)
|
|
465 dev.off()
|
|
466 }
|
7
|
467 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
468 onShort = "reads"
|
|
469 if(on == "Frequency"){
|
|
470 onShort = "freq"
|
|
471 }
|
18
|
472 onx = paste(on, ".x", sep="")
|
|
473 ony = paste(on, ".y", sep="")
|
|
474 onz = paste(on, ".z", sep="")
|
7
|
475 type="triplet"
|
|
476
|
|
477 threshholdIndex = which(colnames(product) == "interval")
|
|
478 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
479 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
480 titleIndex = which(colnames(product) == "Titles")
|
|
481 sampleIndex = which(colnames(patient1) == "Sample")
|
|
482 patientIndex = which(colnames(patient1) == "Patient")
|
|
483 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
484 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
485 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
486
|
29
|
487 if(mergeOn == "Clone_Sequence"){
|
|
488 patient1$merge = paste(patient1$Clone_Sequence)
|
|
489 patient2$merge = paste(patient2$Clone_Sequence)
|
|
490 patient3$merge = paste(patient3$Clone_Sequence)
|
|
491
|
|
492 } else {
|
|
493 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
494 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
495 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
496 }
|
9
|
497
|
|
498 patientMerge = merge(patient1, patient2, by="merge")
|
|
499 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
500 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
501 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
502 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
503 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
504 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
505 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
506 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
507 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
508
|
28
|
509
|
30
|
510 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
511 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
512 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
513 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
514
|
7
|
515 res1 = vector()
|
|
516 res2 = vector()
|
|
517 res3 = vector()
|
20
|
518 res12 = vector()
|
|
519 res13 = vector()
|
|
520 res23 = vector()
|
7
|
521 resAll = vector()
|
|
522 read1Count = vector()
|
|
523 read2Count = vector()
|
|
524 read3Count = vector()
|
|
525
|
|
526 if(appendTriplets){
|
9
|
527 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
528 }
|
|
529 for(iter in 1:length(product[,1])){
|
|
530 threshhold = product[iter,threshholdIndex]
|
|
531 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
532 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
533 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
534 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
535
|
30
|
536 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
537 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
538 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
539
|
30
|
540 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
541 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
542 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
543
|
18
|
544 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
545 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
546 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
547 res1 = append(res1, sum(one))
|
|
548 res2 = append(res2, sum(two))
|
|
549 res3 = append(res3, sum(three))
|
|
550 resAll = append(resAll, sum(all))
|
20
|
551 res12 = append(res12, sum(one_two))
|
|
552 res13 = append(res13, sum(one_three))
|
|
553 res23 = append(res23, sum(two_three))
|
7
|
554 #threshhold = 0
|
|
555 if(threshhold != 0){
|
|
556 if(sum(one) > 0){
|
20
|
557 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
558 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
559 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
560 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
561 }
|
|
562 if(sum(two) > 0){
|
20
|
563 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
564 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
565 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
566 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
567 }
|
|
568 if(sum(three) > 0){
|
20
|
569 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
570 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
571 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
572 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
573 }
|
20
|
574 if(sum(one_two) > 0){
|
|
575 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
576 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
577 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
578 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
579 }
|
|
580 if(sum(one_three) > 0){
|
|
581 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
582 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
583 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
584 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
585 }
|
|
586 if(sum(two_three) > 0){
|
|
587 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
588 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
589 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
590 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
591 }
|
|
592 } else { #scatterplot data
|
24
|
593 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
594 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
595 if(sum(in_two) > 0){
|
|
596 scatterplot_locus_data[in_two,]$type = "In two"
|
|
597 }
|
30
|
598 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
599 if(sum(in_three)> 0){
|
|
600 scatterplot_locus_data[in_three,]$type = "In three"
|
|
601 }
|
|
602 not_in_one = scatterplot_locus_data$type != "In one"
|
|
603 if(sum(not_in_one) > 0){
|
|
604 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
605 }
|
20
|
606 p = NULL
|
|
607 if(nrow(scatterplot_locus_data) != 0){
|
|
608 if(on == "normalized_read_count"){
|
31
|
609 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
610 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
611 } else {
|
32
|
612 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
613 }
|
|
614 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
615 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
616 } else {
|
|
617 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
618 }
|
|
619 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
620 print(p)
|
|
621 dev.off()
|
|
622 }
|
7
|
623 if(sum(all) > 0){
|
20
|
624 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
625 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
626 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
627 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
628 }
|
|
629 }
|
20
|
630 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
631 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
632 colnames(patientResult)[6] = oneSample
|
20
|
633 colnames(patientResult)[7] = twoSample
|
|
634 colnames(patientResult)[8] = threeSample
|
|
635 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
636 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
637 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
638
|
|
639 colnamesBak = colnames(patientResult)
|
20
|
640 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
641 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
642 colnames(patientResult) = colnamesBak
|
|
643
|
|
644 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
645 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
646
|
|
647 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
648 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
649 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
650 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
651 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
652 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
653 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
654 print(plt)
|
|
655 dev.off()
|
|
656
|
|
657 fontSize = 4
|
|
658
|
|
659 bak = patientResult
|
|
660 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
661 patientResult$relativeValue = patientResult$value * 10
|
|
662 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
663 plt = ggplot(patientResult)
|
|
664 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
665 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
666 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
667 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
668 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
669 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
670 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
671 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
672 print(plt)
|
|
673 dev.off()
|
|
674 }
|
|
675
|
33
|
676 if(nrow(triplets) != 0){
|
|
677 triplets$uniqueID = "ID"
|
|
678
|
|
679 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
680 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
681 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
682
|
|
683 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
684 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
685 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
686
|
|
687 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
688
|
|
689 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
690 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
691 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
692
|
|
693 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
694 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
695
|
|
696 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
697
|
|
698 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
699
|
|
700 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
701
|
|
702 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
703
|
|
704 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
705
|
|
706 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
707
|
|
708 #remove duplicate V+J+CDR3, add together numerical values
|
|
709 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
710 Cell_Count=unique(.SD$Cell_Count),
|
|
711 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
712 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
713 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
714 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
715 Frequency=sum(.SD$Frequency),
|
|
716 normalized_read_count=sum(.SD$normalized_read_count),
|
|
717 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
718 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
719
|
|
720
|
|
721 interval = intervalReads
|
|
722 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
723 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
724
|
|
725 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
726 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
727 three = triplets[triplets$Sample == "24062_reg_BM",]
|
53
|
728 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
729
|
|
730 one = triplets[triplets$Sample == "16278_Left",]
|
|
731 two = triplets[triplets$Sample == "26402_Left",]
|
|
732 three = triplets[triplets$Sample == "26759_Left",]
|
53
|
733 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
734
|
|
735 one = triplets[triplets$Sample == "16278_Right",]
|
|
736 two = triplets[triplets$Sample == "26402_Right",]
|
|
737 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
738 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
739
|
|
740
|
|
741 interval = intervalFreq
|
|
742 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
743 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
744
|
|
745 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
746 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
747 three = triplets[triplets$Sample == "24062_reg_BM",]
|
53
|
748 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
749
|
|
750 one = triplets[triplets$Sample == "16278_Left",]
|
|
751 two = triplets[triplets$Sample == "26402_Left",]
|
|
752 three = triplets[triplets$Sample == "26759_Left",]
|
53
|
753 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
754
|
|
755 one = triplets[triplets$Sample == "16278_Right",]
|
|
756 two = triplets[triplets$Sample == "26402_Right",]
|
|
757 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
758 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
759 } else {
|
|
760 cat("", file="triplets.txt")
|
|
761 }
|