0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
22
|
57 patients = split(dat, dat$Patient, drop=T)
|
9
|
58 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
59 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
60 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
61 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
62 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
63 Titles = factor(Titles, levels=Titles)
|
|
64 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
65
|
29
|
66 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
67
|
0
|
68 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
69 if (!is.data.frame(x) & is.list(x)){
|
|
70 x = x[[1]]
|
|
71 }
|
34
|
72 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
73 x = data.frame(x,stringsAsFactors=F)
|
0
|
74 onShort = "reads"
|
|
75 if(on == "Frequency"){
|
|
76 onShort = "freq"
|
|
77 }
|
18
|
78 onx = paste(on, ".x", sep="")
|
|
79 ony = paste(on, ".y", sep="")
|
0
|
80 splt = split(x, x$Sample, drop=T)
|
4
|
81 type="pair"
|
2
|
82 if(length(splt) == 1){
|
3
|
83 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
84 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
85 type="single"
|
2
|
86 }
|
0
|
87 patient1 = splt[[1]]
|
|
88 patient2 = splt[[2]]
|
|
89
|
|
90 threshholdIndex = which(colnames(product) == "interval")
|
|
91 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
92 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
93 titleIndex = which(colnames(product) == "Titles")
|
|
94 sampleIndex = which(colnames(x) == "Sample")
|
|
95 patientIndex = which(colnames(x) == "Patient")
|
|
96 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
97 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
98 patient = paste(x[1,patientIndex])
|
3
|
99
|
0
|
100 switched = F
|
|
101 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
102 tmp = twoSample
|
|
103 twoSample = oneSample
|
|
104 oneSample = tmp
|
|
105 tmp = patient1
|
|
106 patient1 = patient2
|
|
107 patient2 = tmp
|
|
108 switched = T
|
|
109 }
|
|
110 if(appendtxt){
|
4
|
111 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
112 }
|
3
|
113 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
9
|
114
|
29
|
115 if(mergeOn == "Clone_Sequence"){
|
|
116 patient1$merge = paste(patient1$Clone_Sequence)
|
|
117 patient2$merge = paste(patient2$Clone_Sequence)
|
|
118 } else {
|
|
119 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
120 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
121 }
|
9
|
122
|
30
|
123 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
124 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
125 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
126 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
127 scatterplot_data$on = onShort
|
|
128
|
49
|
129 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
130 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
131
|
|
132 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
133
|
34
|
134
|
|
135 #fuzzy matching here...
|
|
136 if(mergeOn == "Clone_Sequence"){
|
49
|
137 #merge.list = patientMerge$merge
|
34
|
138
|
49
|
139 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
140 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
141
|
|
142 patient1.fuzzy = patient1
|
|
143 patient2.fuzzy = patient2
|
|
144
|
36
|
145 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
146 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
147
|
48
|
148 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
149 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
150
|
|
151 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
152 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
34
|
153
|
49
|
154 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
155 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
34
|
156
|
49
|
157 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
158 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
34
|
159
|
37
|
160 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
161 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
49
|
162
|
|
163 merge.list = list()
|
|
164
|
|
165 merge.list[["second"]] = vector()
|
|
166
|
|
167
|
37
|
168 while(nrow(patient.fuzzy) > 1){
|
|
169 first.merge = patient.fuzzy[1,"merge"]
|
|
170 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
49
|
171 first.sample = patient.fuzzy[1,"Sample"]
|
37
|
172 merge.filter = first.merge == patient.fuzzy$merge
|
|
173
|
49
|
174 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
42
|
175
|
49
|
176 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
177 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
178
|
|
179 #first match same sample, sum to a single row, same for other sample
|
|
180 #then merge rows like 'normal'
|
43
|
181
|
48
|
182 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
49
|
183
|
|
184
|
|
185
|
44
|
186 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
49
|
187 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
188 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
189
|
|
190 first.rows = patient.fuzzy[first.match.filter,]
|
|
191 second.rows = patient.fuzzy[second.match.filter,]
|
|
192
|
50
|
193 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
194 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
195 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
196 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
197
|
49
|
198 first.sum = data.frame(merge = first.clone.sequence,
|
|
199 Patient = patient,
|
|
200 Receptor = first.rows[1,"Receptor"],
|
|
201 Sample = first.rows[1,"Sample"],
|
|
202 Cell_Count = first.rows[1,"Cell_Count"],
|
|
203 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
204 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
205 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
206 dsPerM = sum(first.rows$dsPerM),
|
50
|
207 J_Segment_Major_Gene = first.rows.j,
|
|
208 V_Segment_Major_Gene = first.rows.v,
|
49
|
209 Clone_Sequence = first.clone.sequence,
|
|
210 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
211 Related_to_leukemia_clone = F,
|
|
212 Frequency = sum(first.rows$Frequency),
|
|
213 locus_V = first.rows[1,"locus_V"],
|
|
214 locus_J = first.rows[1,"locus_J"],
|
|
215 min_cell_count = first.rows[1,"min_cell_count"],
|
|
216 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
217 paste = first.rows[1,"paste"],
|
|
218 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
219
|
|
220 if(nrow(second.rows) > 0){
|
50
|
221 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
222 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
223 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
224 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
225
|
49
|
226 second.sum = data.frame(merge = first.clone.sequence,
|
|
227 Patient = patient,
|
|
228 Receptor = second.rows[1,"Receptor"],
|
|
229 Sample = second.rows[1,"Sample"],
|
|
230 Cell_Count = second.rows[1,"Cell_Count"],
|
|
231 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
232 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
233 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
234 dsPerM = sum(second.rows$dsPerM),
|
50
|
235 J_Segment_Major_Gene = second.rows.j,
|
|
236 V_Segment_Major_Gene = second.rows.v,
|
49
|
237 Clone_Sequence = first.clone.sequence,
|
|
238 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
239 Related_to_leukemia_clone = F,
|
|
240 Frequency = sum(second.rows$Frequency),
|
|
241 locus_V = second.rows[1,"locus_V"],
|
|
242 locus_J = second.rows[1,"locus_J"],
|
|
243 min_cell_count = second.rows[1,"min_cell_count"],
|
|
244 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
245 paste = second.rows[1,"paste"],
|
|
246 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
247
|
|
248 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
249 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
250
|
50
|
251 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
252 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
49
|
253
|
|
254 if(sum(first.match.filter) == 1 & sum(second.match.filter) == 1){
|
|
255 second.clone.sequence = patient.fuzzy[second.match.filter, "Clone_Sequence"]
|
50
|
256 if(first.clone.sequence != second.clone.sequence){
|
|
257 #merge.list[["second"]] = append(merge.list[["second"]], second.clone.sequence)
|
49
|
258 }
|
|
259 }
|
|
260
|
|
261 if(nrow(first.rows) > 1 | nrow(second.rows) > 1){
|
|
262
|
|
263 }
|
|
264
|
37
|
265 } else {
|
|
266 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
267 }
|
|
268 }
|
|
269
|
|
270 }
|
|
271
|
37
|
272
|
18
|
273 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
274 res1 = vector()
|
|
275 res2 = vector()
|
|
276 resBoth = vector()
|
|
277 read1Count = vector()
|
|
278 read2Count = vector()
|
2
|
279 locussum1 = vector()
|
|
280 locussum2 = vector()
|
9
|
281
|
0
|
282 #for(iter in 1){
|
|
283 for(iter in 1:length(product[,1])){
|
|
284 threshhold = product[iter,threshholdIndex]
|
|
285 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
286 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
287 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
288 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
289 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
290 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
291 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
292 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
293 res1 = append(res1, sum(one))
|
2
|
294 res2 = append(res2, sum(two))
|
0
|
295 resBoth = append(resBoth, sum(both))
|
2
|
296 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
297 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
298 #threshhold = 0
|
|
299 if(threshhold != 0){
|
|
300 if(sum(one) > 0){
|
15
|
301 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
302 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
303 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
304 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
305 }
|
|
306 if(sum(two) > 0){
|
15
|
307 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
308 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
309 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
310 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
311 }
|
29
|
312 } else {
|
|
313 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
49
|
314 #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),]
|
|
315 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
29
|
316 if(nrow(scatterplot_locus_data) > 0){
|
|
317 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
318 }
|
34
|
319 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
320 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
321 if(any(in_two)){
|
49
|
322 scatterplot_locus_data[in_two,]$type = twoSample
|
34
|
323 }
|
49
|
324 in_both = (scatterplot_locus_data$merge %in% patientMerge$merge)
|
|
325 #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]])
|
|
326 #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches)
|
30
|
327 if(any(in_both)){
|
|
328 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
329 }
|
29
|
330 if(type == "single"){
|
|
331 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
332 }
|
|
333 p = NULL
|
|
334 if(nrow(scatterplot_locus_data) != 0){
|
|
335 if(on == "normalized_read_count"){
|
30
|
336 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
49
|
337 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
338 } else {
|
49
|
339 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
340 }
|
|
341 p = p + geom_point(aes(colour=type), position="jitter")
|
|
342 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
343 } else {
|
|
344 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
345 }
|
|
346 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
347 print(p)
|
|
348 dev.off()
|
0
|
349 }
|
|
350 if(sum(both) > 0){
|
15
|
351 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
352 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
353 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
354 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
355 }
|
0
|
356 }
|
2
|
357 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
358 if(sum(is.na(patientResult$percentage)) > 0){
|
|
359 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
360 }
|
|
361 colnames(patientResult)[6] = oneSample
|
|
362 colnames(patientResult)[8] = twoSample
|
|
363 colnamesBak = colnames(patientResult)
|
2
|
364 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
365 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
366 colnames(patientResult) = colnamesBak
|
|
367
|
|
368 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
369 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
370
|
|
371 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
372 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
373 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
374 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
375 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
376 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
377 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
378 print(plt)
|
|
379 dev.off()
|
|
380 #(t,r,b,l)
|
|
381 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
382 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
383 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
384 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
385 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
386 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
387 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
388 print(plt)
|
|
389 dev.off()
|
|
390
|
|
391 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
392 patientResult$relativeValue = patientResult$value * 10
|
|
393 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
394 plt = ggplot(patientResult)
|
|
395 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
396 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
397 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
398 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
399 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
400 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
401 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
402 print(plt)
|
|
403 dev.off()
|
|
404 }
|
|
405
|
3
|
406 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
407
|
0
|
408 interval = intervalFreq
|
|
409 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
410 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
411 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
412
|
3
|
413 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
414
|
0
|
415 interval = intervalReads
|
|
416 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
417 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
418 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
419
|
3
|
420 cat("</table></html>", file=logfile, append=T)
|
|
421
|
33
|
422
|
|
423 if(nrow(single_patients) > 0){
|
|
424 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
425 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
426 p = p + geom_point(aes(colour=type), position="jitter")
|
|
427 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
428 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
429 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
430 print(p)
|
|
431 dev.off()
|
7
|
432
|
33
|
433 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
434 p = p + geom_point(aes(colour=type), position="jitter")
|
|
435 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
436 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
437 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
438 print(p)
|
|
439 dev.off()
|
|
440 } else {
|
|
441 empty <- data.frame()
|
|
442 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
443
|
|
444 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
445 print(p)
|
|
446 dev.off()
|
|
447
|
|
448 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
449 print(p)
|
|
450 dev.off()
|
|
451 }
|
7
|
452 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
453 onShort = "reads"
|
|
454 if(on == "Frequency"){
|
|
455 onShort = "freq"
|
|
456 }
|
18
|
457 onx = paste(on, ".x", sep="")
|
|
458 ony = paste(on, ".y", sep="")
|
|
459 onz = paste(on, ".z", sep="")
|
7
|
460 type="triplet"
|
|
461
|
|
462 threshholdIndex = which(colnames(product) == "interval")
|
|
463 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
464 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
465 titleIndex = which(colnames(product) == "Titles")
|
|
466 sampleIndex = which(colnames(patient1) == "Sample")
|
|
467 patientIndex = which(colnames(patient1) == "Patient")
|
|
468 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
469 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
470 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
471
|
29
|
472 if(mergeOn == "Clone_Sequence"){
|
|
473 patient1$merge = paste(patient1$Clone_Sequence)
|
|
474 patient2$merge = paste(patient2$Clone_Sequence)
|
|
475 patient3$merge = paste(patient3$Clone_Sequence)
|
|
476
|
|
477 } else {
|
|
478 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
479 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
480 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
481 }
|
9
|
482
|
|
483 patientMerge = merge(patient1, patient2, by="merge")
|
|
484 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
485 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
486 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
487 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
488 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
489 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
490 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
491 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
492 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
493
|
28
|
494
|
30
|
495 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
496 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
497 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
498 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
499
|
7
|
500 res1 = vector()
|
|
501 res2 = vector()
|
|
502 res3 = vector()
|
20
|
503 res12 = vector()
|
|
504 res13 = vector()
|
|
505 res23 = vector()
|
7
|
506 resAll = vector()
|
|
507 read1Count = vector()
|
|
508 read2Count = vector()
|
|
509 read3Count = vector()
|
|
510
|
|
511 if(appendTriplets){
|
9
|
512 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
513 }
|
|
514 for(iter in 1:length(product[,1])){
|
|
515 threshhold = product[iter,threshholdIndex]
|
|
516 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
517 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
518 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
519 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
520
|
30
|
521 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
522 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
523 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
524
|
30
|
525 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
526 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
527 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
528
|
18
|
529 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
530 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
531 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
532 res1 = append(res1, sum(one))
|
|
533 res2 = append(res2, sum(two))
|
|
534 res3 = append(res3, sum(three))
|
|
535 resAll = append(resAll, sum(all))
|
20
|
536 res12 = append(res12, sum(one_two))
|
|
537 res13 = append(res13, sum(one_three))
|
|
538 res23 = append(res23, sum(two_three))
|
7
|
539 #threshhold = 0
|
|
540 if(threshhold != 0){
|
|
541 if(sum(one) > 0){
|
20
|
542 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
543 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
544 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
545 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
546 }
|
|
547 if(sum(two) > 0){
|
20
|
548 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
549 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
550 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
551 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
552 }
|
|
553 if(sum(three) > 0){
|
20
|
554 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
555 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
556 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
557 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
558 }
|
20
|
559 if(sum(one_two) > 0){
|
|
560 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
561 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
562 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
563 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
564 }
|
|
565 if(sum(one_three) > 0){
|
|
566 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
567 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
568 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
569 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
570 }
|
|
571 if(sum(two_three) > 0){
|
|
572 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
573 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
574 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
575 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
576 }
|
|
577 } else { #scatterplot data
|
24
|
578 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
579 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
580 if(sum(in_two) > 0){
|
|
581 scatterplot_locus_data[in_two,]$type = "In two"
|
|
582 }
|
30
|
583 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
584 if(sum(in_three)> 0){
|
|
585 scatterplot_locus_data[in_three,]$type = "In three"
|
|
586 }
|
|
587 not_in_one = scatterplot_locus_data$type != "In one"
|
|
588 if(sum(not_in_one) > 0){
|
|
589 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
590 }
|
20
|
591 p = NULL
|
|
592 if(nrow(scatterplot_locus_data) != 0){
|
|
593 if(on == "normalized_read_count"){
|
31
|
594 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
595 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
596 } else {
|
32
|
597 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
598 }
|
|
599 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
600 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
601 } else {
|
|
602 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
603 }
|
|
604 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
605 print(p)
|
|
606 dev.off()
|
|
607 }
|
7
|
608 if(sum(all) > 0){
|
20
|
609 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
610 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
611 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
612 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
613 }
|
|
614 }
|
20
|
615 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
616 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
617 colnames(patientResult)[6] = oneSample
|
20
|
618 colnames(patientResult)[7] = twoSample
|
|
619 colnames(patientResult)[8] = threeSample
|
|
620 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
621 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
622 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
623
|
|
624 colnamesBak = colnames(patientResult)
|
20
|
625 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
626 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
627 colnames(patientResult) = colnamesBak
|
|
628
|
|
629 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
630 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
631
|
|
632 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
633 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
634 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
635 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
636 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
637 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
638 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
639 print(plt)
|
|
640 dev.off()
|
|
641
|
|
642 fontSize = 4
|
|
643
|
|
644 bak = patientResult
|
|
645 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
646 patientResult$relativeValue = patientResult$value * 10
|
|
647 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
648 plt = ggplot(patientResult)
|
|
649 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
650 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
651 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
652 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
653 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
654 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
655 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
656 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
657 print(plt)
|
|
658 dev.off()
|
|
659 }
|
|
660
|
33
|
661 if(nrow(triplets) != 0){
|
|
662 triplets$uniqueID = "ID"
|
|
663
|
|
664 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
665 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
666 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
667
|
|
668 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
669 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
670 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
671
|
|
672 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
673
|
|
674 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
675 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
676 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
677
|
|
678 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
679 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
680
|
|
681 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
682
|
|
683 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
684
|
|
685 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
686
|
|
687 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
688
|
|
689 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
690
|
|
691 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
692
|
|
693 #remove duplicate V+J+CDR3, add together numerical values
|
|
694 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
695 Cell_Count=unique(.SD$Cell_Count),
|
|
696 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
697 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
698 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
699 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
700 Frequency=sum(.SD$Frequency),
|
|
701 normalized_read_count=sum(.SD$normalized_read_count),
|
|
702 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
703 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
704
|
|
705
|
|
706 interval = intervalReads
|
|
707 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
708 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
709
|
|
710 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
711 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
712 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
713 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
714
|
|
715 one = triplets[triplets$Sample == "16278_Left",]
|
|
716 two = triplets[triplets$Sample == "26402_Left",]
|
|
717 three = triplets[triplets$Sample == "26759_Left",]
|
|
718 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
719
|
|
720 one = triplets[triplets$Sample == "16278_Right",]
|
|
721 two = triplets[triplets$Sample == "26402_Right",]
|
|
722 three = triplets[triplets$Sample == "26759_Right",]
|
|
723 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
724
|
|
725
|
|
726 interval = intervalFreq
|
|
727 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
728 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
729
|
|
730 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
731 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
732 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
733 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
734
|
|
735 one = triplets[triplets$Sample == "16278_Left",]
|
|
736 two = triplets[triplets$Sample == "26402_Left",]
|
|
737 three = triplets[triplets$Sample == "26759_Left",]
|
|
738 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
739
|
|
740 one = triplets[triplets$Sample == "16278_Right",]
|
|
741 two = triplets[triplets$Sample == "26402_Right",]
|
|
742 three = triplets[triplets$Sample == "26759_Right",]
|
|
743 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
744 } else {
|
|
745 cat("", file="triplets.txt")
|
|
746 }
|