0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
29
|
57 #remove duplicate V+J+CDR3, add together numerical values
|
34
|
58 if(mergeOn != "Clone_Sequence"){
|
|
59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
|
|
60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
|
|
61 Cell_Count=unique(.SD$Cell_Count),
|
|
62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
63 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
66 Frequency=sum(.SD$Frequency),
|
|
67 locus_V=unique(.SD$locus_V),
|
|
68 locus_J=unique(.SD$locus_J),
|
|
69 min_cell_count=unique(.SD$min_cell_count),
|
|
70 normalized_read_count=sum(.SD$normalized_read_count),
|
|
71 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
72 Clone_Sequence=.SD$Clone_Sequence[1],
|
|
73 min_cell_paste=.SD$min_cell_paste[1],
|
|
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
75 }
|
29
|
76
|
22
|
77 patients = split(dat, dat$Patient, drop=T)
|
9
|
78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
81 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
82 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
83 Titles = factor(Titles, levels=Titles)
|
|
84 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
85
|
29
|
86 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
87
|
0
|
88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
89 if (!is.data.frame(x) & is.list(x)){
|
|
90 x = x[[1]]
|
|
91 }
|
34
|
92 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
93 x = data.frame(x,stringsAsFactors=F)
|
0
|
94 onShort = "reads"
|
|
95 if(on == "Frequency"){
|
|
96 onShort = "freq"
|
|
97 }
|
18
|
98 onx = paste(on, ".x", sep="")
|
|
99 ony = paste(on, ".y", sep="")
|
0
|
100 splt = split(x, x$Sample, drop=T)
|
4
|
101 type="pair"
|
2
|
102 if(length(splt) == 1){
|
3
|
103 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
104 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
105 type="single"
|
2
|
106 }
|
0
|
107 patient1 = splt[[1]]
|
|
108 patient2 = splt[[2]]
|
|
109
|
|
110 threshholdIndex = which(colnames(product) == "interval")
|
|
111 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
112 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
113 titleIndex = which(colnames(product) == "Titles")
|
|
114 sampleIndex = which(colnames(x) == "Sample")
|
|
115 patientIndex = which(colnames(x) == "Patient")
|
|
116 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
117 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
118 patient = paste(x[1,patientIndex])
|
3
|
119
|
0
|
120 switched = F
|
|
121 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
122 tmp = twoSample
|
|
123 twoSample = oneSample
|
|
124 oneSample = tmp
|
|
125 tmp = patient1
|
|
126 patient1 = patient2
|
|
127 patient2 = tmp
|
|
128 switched = T
|
|
129 }
|
|
130 if(appendtxt){
|
4
|
131 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
132 }
|
3
|
133 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
9
|
134
|
29
|
135 if(mergeOn == "Clone_Sequence"){
|
|
136 patient1$merge = paste(patient1$Clone_Sequence)
|
|
137 patient2$merge = paste(patient2$Clone_Sequence)
|
|
138 } else {
|
|
139 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
140 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
141 }
|
9
|
142
|
30
|
143 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
144 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
147 scatterplot_data$on = onShort
|
|
148
|
9
|
149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
|
34
|
150
|
|
151
|
|
152 #fuzzy matching here...
|
|
153 if(mergeOn == "Clone_Sequence"){
|
|
154 merge.list = patientMerge$merge
|
|
155
|
|
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
158
|
36
|
159 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
161
|
|
162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
34
|
164
|
37
|
165 merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"])))
|
34
|
166 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
|
167
|
|
168 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
169 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
170
|
37
|
171 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
172 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
|
173
|
|
174 while(nrow(patient.fuzzy) > 1){
|
|
175 first.merge = patient.fuzzy[1,"merge"]
|
|
176 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
177
|
|
178 merge.filter = first.merge == patient.fuzzy$merge
|
|
179
|
|
180 match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence)
|
34
|
181
|
37
|
182 if(sum(match.filter) == 2){
|
|
183 second.match = which(match.filter)[2]
|
|
184 second.clone.sequence = patient.fuzzy[second.match,"Clone_Sequence"]
|
|
185 first.sample = patient.fuzzy[1,"Sample"]
|
|
186 second.sample = patient.fuzzy[second.match,"Sample"]
|
34
|
187
|
37
|
188 if(((nchar(second.clone.sequence) - nchar(first.clone.sequence)) <= 9) & (first.sample != second.sample)){
|
|
189 first.match.row = patient.fuzzy[1,]
|
|
190 second.match.row = patient.fuzzy[second.match,]
|
|
191 print(paste(first.merge, first.match.row$normalized_read_count, second.match.row$normalized_read_count, first.clone.sequence, second.clone.sequence))
|
|
192 patientMerge.new.row = data.frame(merge=first.clone.sequence,
|
|
193 min_cell_paste.x=first.match.row[1,"min_cell_paste"],
|
|
194 Patient.x=first.match.row[1,"Patient"],
|
|
195 Receptor.x=first.match.row[1,"Receptor"],
|
|
196 Sample.x=first.match.row[1,"Sample"],
|
|
197 Cell_Count.x=first.match.row[1,"Cell_Count"],
|
|
198 Clone_Molecule_Count_From_Spikes.x=first.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
199 Log10_Frequency.x=first.match.row[1,"Log10_Frequency"],
|
|
200 Total_Read_Count.x=first.match.row[1,"Total_Read_Count"],
|
|
201 dsPerM.x=first.match.row[1,"dsPerM"],
|
|
202 J_Segment_Major_Gene.x=first.match.row[1,"J_Segment_Major_Gene"],
|
|
203 V_Segment_Major_Gene.x=first.match.row[1,"V_Segment_Major_Gene"],
|
|
204 Clone_Sequence.x=first.match.row[1,"Clone_Sequence"],
|
|
205 CDR3_Sense_Sequence.x=first.match.row[1,"CDR3_Sense_Sequence"],
|
|
206 Related_to_leukemia_clone.x=first.match.row[1,"Related_to_leukemia_clone"],
|
|
207 Frequency.x=first.match.row[1,"Frequency"],
|
|
208 locus_V.x=first.match.row[1,"locus_V"],
|
|
209 locus_J.x=first.match.row[1,"locus_J"],
|
|
210 min_cell_count.x=first.match.row[1,"min_cell_count"],
|
|
211 normalized_read_count.x=first.match.row[1,"normalized_read_count"],
|
|
212 paste.x=first.match.row[1,"paste"],
|
|
213 min_cell_paste.y=second.match.row[1,"min_cell_paste"],
|
|
214 Patient.y=second.match.row[1,"Patient"],
|
|
215 Receptor.y=second.match.row[1,"Receptor"],
|
|
216 Sample.y=second.match.row[1,"Sample"],
|
|
217 Cell_Count.y=second.match.row[1,"Cell_Count"],
|
|
218 Clone_Molecule_Count_From_Spikes.y=second.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
219 Log10_Frequency.y=second.match.row[1,"Log10_Frequency"],
|
|
220 Total_Read_Count.y=second.match.row[1,"Total_Read_Count"],
|
|
221 dsPerM.y=second.match.row[1,"dsPerM"],
|
|
222 J_Segment_Major_Gene.y=second.match.row[1,"J_Segment_Major_Gene"],
|
|
223 V_Segment_Major_Gene.y=second.match.row[1,"V_Segment_Major_Gene"],
|
|
224 Clone_Sequence.y=second.match.row[1,"Clone_Sequence"],
|
|
225 CDR3_Sense_Sequence.y=second.match.row[1,"CDR3_Sense_Sequence"],
|
|
226 Related_to_leukemia_clone.y=second.match.row[1,"Related_to_leukemia_clone"],
|
|
227 Frequency.y=second.match.row[1,"Frequency"],
|
|
228 locus_V.y=second.match.row[1,"locus_V"],
|
|
229 locus_J.y=second.match.row[1,"locus_J"],
|
|
230 min_cell_count.y=second.match.row[1,"min_cell_count"],
|
|
231 normalized_read_count.y=second.match.row[1,"normalized_read_count"],
|
|
232 paste.y=first.match.row[1,"paste"])
|
|
233
|
|
234
|
|
235 patientMerge = rbind(patientMerge, patientMerge.new.row)
|
|
236 patient.fuzzy = patient.fuzzy[-match.filter,]
|
|
237
|
|
238 patient1 = patient1[!(patient1$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
239 patient2 = patient2[!(patient2$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
240
|
|
241 scatterplot_data = scatterplot_data[scatterplot_data$merge != second.clone.sequence,]
|
|
242
|
|
243 } else {
|
|
244 patient.fuzzy = patient.fuzzy[-1,]
|
|
245 }
|
|
246
|
|
247 } else if (sum(match.filter) > 2){
|
|
248 print(paste("Multiple matches found for", first.merge, "in", patient))
|
|
249 } else {
|
|
250 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
251 }
|
37
|
252
|
|
253
|
34
|
254 }
|
|
255
|
|
256 }
|
|
257
|
37
|
258
|
18
|
259 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
260 res1 = vector()
|
|
261 res2 = vector()
|
|
262 resBoth = vector()
|
|
263 read1Count = vector()
|
|
264 read2Count = vector()
|
2
|
265 locussum1 = vector()
|
|
266 locussum2 = vector()
|
9
|
267
|
0
|
268 #for(iter in 1){
|
|
269 for(iter in 1:length(product[,1])){
|
|
270 threshhold = product[iter,threshholdIndex]
|
|
271 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
272 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
273 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
274 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
275 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
276 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
277 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
278 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
279 res1 = append(res1, sum(one))
|
2
|
280 res2 = append(res2, sum(two))
|
0
|
281 resBoth = append(resBoth, sum(both))
|
2
|
282 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
283 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
284 #threshhold = 0
|
|
285 if(threshhold != 0){
|
|
286 if(sum(one) > 0){
|
15
|
287 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
288 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
289 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
290 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
291 }
|
|
292 if(sum(two) > 0){
|
15
|
293 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
294 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
295 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
296 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
297 }
|
29
|
298 } else {
|
|
299 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
300 if(nrow(scatterplot_locus_data) > 0){
|
|
301 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
302 }
|
34
|
303 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
304 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
305 not_in_one = !in_one
|
|
306 if(any(in_two)){
|
|
307 scatterplot_locus_data[not_in_one,]$type = twoSample
|
|
308 }
|
30
|
309 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
|
|
310 if(any(in_both)){
|
|
311 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
312 }
|
29
|
313 if(type == "single"){
|
|
314 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
315 }
|
|
316 p = NULL
|
|
317 if(nrow(scatterplot_locus_data) != 0){
|
|
318 if(on == "normalized_read_count"){
|
30
|
319 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
34
|
320 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
|
29
|
321 } else {
|
30
|
322 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
29
|
323 }
|
|
324 p = p + geom_point(aes(colour=type), position="jitter")
|
|
325 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
326 } else {
|
|
327 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
328 }
|
|
329 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
330 print(p)
|
|
331 dev.off()
|
0
|
332 }
|
|
333 if(sum(both) > 0){
|
15
|
334 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
335 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
336 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
337 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
338 }
|
0
|
339 }
|
2
|
340 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
341 if(sum(is.na(patientResult$percentage)) > 0){
|
|
342 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
343 }
|
|
344 colnames(patientResult)[6] = oneSample
|
|
345 colnames(patientResult)[8] = twoSample
|
|
346 colnamesBak = colnames(patientResult)
|
2
|
347 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
348 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
349 colnames(patientResult) = colnamesBak
|
|
350
|
|
351 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
352 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
353
|
|
354 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
355 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
356 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
357 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
358 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
359 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
360 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
361 print(plt)
|
|
362 dev.off()
|
|
363 #(t,r,b,l)
|
|
364 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
365 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
366 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
367 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
368 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
369 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
370 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
371 print(plt)
|
|
372 dev.off()
|
|
373
|
|
374 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
375 patientResult$relativeValue = patientResult$value * 10
|
|
376 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
377 plt = ggplot(patientResult)
|
|
378 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
379 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
380 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
381 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
382 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
383 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
384 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
385 print(plt)
|
|
386 dev.off()
|
|
387 }
|
|
388
|
3
|
389 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
390
|
0
|
391 interval = intervalFreq
|
|
392 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
393 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
394 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
395
|
3
|
396 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
397
|
0
|
398 interval = intervalReads
|
|
399 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
400 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
401 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
402
|
3
|
403 cat("</table></html>", file=logfile, append=T)
|
|
404
|
33
|
405
|
|
406 if(nrow(single_patients) > 0){
|
|
407 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
408 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
409 p = p + geom_point(aes(colour=type), position="jitter")
|
|
410 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
411 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
412 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
413 print(p)
|
|
414 dev.off()
|
7
|
415
|
33
|
416 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
417 p = p + geom_point(aes(colour=type), position="jitter")
|
|
418 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
419 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
420 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
421 print(p)
|
|
422 dev.off()
|
|
423 } else {
|
|
424 empty <- data.frame()
|
|
425 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
426
|
|
427 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
428 print(p)
|
|
429 dev.off()
|
|
430
|
|
431 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
432 print(p)
|
|
433 dev.off()
|
|
434 }
|
7
|
435 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
436 onShort = "reads"
|
|
437 if(on == "Frequency"){
|
|
438 onShort = "freq"
|
|
439 }
|
18
|
440 onx = paste(on, ".x", sep="")
|
|
441 ony = paste(on, ".y", sep="")
|
|
442 onz = paste(on, ".z", sep="")
|
7
|
443 type="triplet"
|
|
444
|
|
445 threshholdIndex = which(colnames(product) == "interval")
|
|
446 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
447 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
448 titleIndex = which(colnames(product) == "Titles")
|
|
449 sampleIndex = which(colnames(patient1) == "Sample")
|
|
450 patientIndex = which(colnames(patient1) == "Patient")
|
|
451 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
452 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
453 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
454
|
29
|
455 if(mergeOn == "Clone_Sequence"){
|
|
456 patient1$merge = paste(patient1$Clone_Sequence)
|
|
457 patient2$merge = paste(patient2$Clone_Sequence)
|
|
458 patient3$merge = paste(patient3$Clone_Sequence)
|
|
459
|
|
460 } else {
|
|
461 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
462 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
463 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
464 }
|
9
|
465
|
|
466 patientMerge = merge(patient1, patient2, by="merge")
|
|
467 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
468 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
469 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
470 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
471 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
472 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
473 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
474 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
475 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
476
|
28
|
477
|
30
|
478 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
479 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
480 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
481 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
482
|
7
|
483 res1 = vector()
|
|
484 res2 = vector()
|
|
485 res3 = vector()
|
20
|
486 res12 = vector()
|
|
487 res13 = vector()
|
|
488 res23 = vector()
|
7
|
489 resAll = vector()
|
|
490 read1Count = vector()
|
|
491 read2Count = vector()
|
|
492 read3Count = vector()
|
|
493
|
|
494 if(appendTriplets){
|
9
|
495 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
496 }
|
|
497 for(iter in 1:length(product[,1])){
|
|
498 threshhold = product[iter,threshholdIndex]
|
|
499 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
500 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
501 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
502 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
503
|
30
|
504 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
505 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
506 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
507
|
30
|
508 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
509 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
510 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
511
|
18
|
512 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
513 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
514 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
515 res1 = append(res1, sum(one))
|
|
516 res2 = append(res2, sum(two))
|
|
517 res3 = append(res3, sum(three))
|
|
518 resAll = append(resAll, sum(all))
|
20
|
519 res12 = append(res12, sum(one_two))
|
|
520 res13 = append(res13, sum(one_three))
|
|
521 res23 = append(res23, sum(two_three))
|
7
|
522 #threshhold = 0
|
|
523 if(threshhold != 0){
|
|
524 if(sum(one) > 0){
|
20
|
525 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
526 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
527 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
528 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
529 }
|
|
530 if(sum(two) > 0){
|
20
|
531 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
532 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
533 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
534 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
535 }
|
|
536 if(sum(three) > 0){
|
20
|
537 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
538 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
539 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
540 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
541 }
|
20
|
542 if(sum(one_two) > 0){
|
|
543 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
544 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
545 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
546 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
547 }
|
|
548 if(sum(one_three) > 0){
|
|
549 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
550 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
551 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
552 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
553 }
|
|
554 if(sum(two_three) > 0){
|
|
555 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
556 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
557 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
558 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
559 }
|
|
560 } else { #scatterplot data
|
24
|
561 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
562 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
563 if(sum(in_two) > 0){
|
|
564 scatterplot_locus_data[in_two,]$type = "In two"
|
|
565 }
|
30
|
566 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
567 if(sum(in_three)> 0){
|
|
568 scatterplot_locus_data[in_three,]$type = "In three"
|
|
569 }
|
|
570 not_in_one = scatterplot_locus_data$type != "In one"
|
|
571 if(sum(not_in_one) > 0){
|
|
572 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
573 }
|
20
|
574 p = NULL
|
|
575 if(nrow(scatterplot_locus_data) != 0){
|
|
576 if(on == "normalized_read_count"){
|
31
|
577 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
578 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
579 } else {
|
32
|
580 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
581 }
|
|
582 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
583 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
584 } else {
|
|
585 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
586 }
|
|
587 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
588 print(p)
|
|
589 dev.off()
|
|
590 }
|
7
|
591 if(sum(all) > 0){
|
20
|
592 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
593 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
594 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
595 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
596 }
|
|
597 }
|
20
|
598 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
599 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
600 colnames(patientResult)[6] = oneSample
|
20
|
601 colnames(patientResult)[7] = twoSample
|
|
602 colnames(patientResult)[8] = threeSample
|
|
603 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
604 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
605 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
606
|
|
607 colnamesBak = colnames(patientResult)
|
20
|
608 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
609 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
610 colnames(patientResult) = colnamesBak
|
|
611
|
|
612 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
613 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
614
|
|
615 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
616 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
617 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
618 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
619 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
620 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
621 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
622 print(plt)
|
|
623 dev.off()
|
|
624
|
|
625 fontSize = 4
|
|
626
|
|
627 bak = patientResult
|
|
628 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
629 patientResult$relativeValue = patientResult$value * 10
|
|
630 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
631 plt = ggplot(patientResult)
|
|
632 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
633 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
634 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
635 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
636 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
637 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
638 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
639 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
640 print(plt)
|
|
641 dev.off()
|
|
642 }
|
|
643
|
33
|
644 if(nrow(triplets) != 0){
|
|
645 triplets$uniqueID = "ID"
|
|
646
|
|
647 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
648 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
649 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
650
|
|
651 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
652 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
653 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
654
|
|
655 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
656
|
|
657 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
658 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
659 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
660
|
|
661 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
662 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
663
|
|
664 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
665
|
|
666 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
667
|
|
668 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
669
|
|
670 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
671
|
|
672 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
673
|
|
674 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
675
|
|
676 #remove duplicate V+J+CDR3, add together numerical values
|
|
677 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
678 Cell_Count=unique(.SD$Cell_Count),
|
|
679 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
680 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
681 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
682 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
683 Frequency=sum(.SD$Frequency),
|
|
684 normalized_read_count=sum(.SD$normalized_read_count),
|
|
685 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
686 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
687
|
|
688
|
|
689 interval = intervalReads
|
|
690 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
691 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
692
|
|
693 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
694 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
695 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
696 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
697
|
|
698 one = triplets[triplets$Sample == "16278_Left",]
|
|
699 two = triplets[triplets$Sample == "26402_Left",]
|
|
700 three = triplets[triplets$Sample == "26759_Left",]
|
|
701 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
702
|
|
703 one = triplets[triplets$Sample == "16278_Right",]
|
|
704 two = triplets[triplets$Sample == "26402_Right",]
|
|
705 three = triplets[triplets$Sample == "26759_Right",]
|
|
706 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
707
|
|
708
|
|
709 interval = intervalFreq
|
|
710 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
711 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
712
|
|
713 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
714 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
715 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
716 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
717
|
|
718 one = triplets[triplets$Sample == "16278_Left",]
|
|
719 two = triplets[triplets$Sample == "26402_Left",]
|
|
720 three = triplets[triplets$Sample == "26759_Left",]
|
|
721 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
722
|
|
723 one = triplets[triplets$Sample == "16278_Right",]
|
|
724 two = triplets[triplets$Sample == "26402_Right",]
|
|
725 three = triplets[triplets$Sample == "26759_Right",]
|
|
726 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
727 } else {
|
|
728 cat("", file="triplets.txt")
|
|
729 }
|