0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
29
|
57 #remove duplicate V+J+CDR3, add together numerical values
|
34
|
58 if(mergeOn != "Clone_Sequence"){
|
|
59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
|
|
60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
|
|
61 Cell_Count=unique(.SD$Cell_Count),
|
|
62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
63 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
66 Frequency=sum(.SD$Frequency),
|
|
67 locus_V=unique(.SD$locus_V),
|
|
68 locus_J=unique(.SD$locus_J),
|
|
69 min_cell_count=unique(.SD$min_cell_count),
|
|
70 normalized_read_count=sum(.SD$normalized_read_count),
|
|
71 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
72 Clone_Sequence=.SD$Clone_Sequence[1],
|
|
73 min_cell_paste=.SD$min_cell_paste[1],
|
|
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
75 }
|
29
|
76
|
22
|
77 patients = split(dat, dat$Patient, drop=T)
|
9
|
78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
81 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
82 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
83 Titles = factor(Titles, levels=Titles)
|
|
84 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
85
|
29
|
86 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
87
|
0
|
88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
89 if (!is.data.frame(x) & is.list(x)){
|
|
90 x = x[[1]]
|
|
91 }
|
34
|
92 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
93 x = data.frame(x,stringsAsFactors=F)
|
0
|
94 onShort = "reads"
|
|
95 if(on == "Frequency"){
|
|
96 onShort = "freq"
|
|
97 }
|
18
|
98 onx = paste(on, ".x", sep="")
|
|
99 ony = paste(on, ".y", sep="")
|
0
|
100 splt = split(x, x$Sample, drop=T)
|
4
|
101 type="pair"
|
2
|
102 if(length(splt) == 1){
|
3
|
103 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
104 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
105 type="single"
|
2
|
106 }
|
0
|
107 patient1 = splt[[1]]
|
|
108 patient2 = splt[[2]]
|
|
109
|
|
110 threshholdIndex = which(colnames(product) == "interval")
|
|
111 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
112 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
113 titleIndex = which(colnames(product) == "Titles")
|
|
114 sampleIndex = which(colnames(x) == "Sample")
|
|
115 patientIndex = which(colnames(x) == "Patient")
|
|
116 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
117 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
118 patient = paste(x[1,patientIndex])
|
3
|
119
|
0
|
120 switched = F
|
|
121 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
122 tmp = twoSample
|
|
123 twoSample = oneSample
|
|
124 oneSample = tmp
|
|
125 tmp = patient1
|
|
126 patient1 = patient2
|
|
127 patient2 = tmp
|
|
128 switched = T
|
|
129 }
|
|
130 if(appendtxt){
|
4
|
131 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
132 }
|
3
|
133 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
9
|
134
|
29
|
135 if(mergeOn == "Clone_Sequence"){
|
|
136 patient1$merge = paste(patient1$Clone_Sequence)
|
|
137 patient2$merge = paste(patient2$Clone_Sequence)
|
|
138 } else {
|
|
139 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
140 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
141 }
|
9
|
142
|
30
|
143 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
144 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
147 scatterplot_data$on = onShort
|
|
148
|
9
|
149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
|
34
|
150
|
|
151
|
|
152 #fuzzy matching here...
|
|
153 if(mergeOn == "Clone_Sequence"){
|
|
154 merge.list = patientMerge$merge
|
|
155
|
|
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
158
|
36
|
159 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
161
|
|
162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
34
|
164
|
37
|
165 merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"])))
|
34
|
166 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
|
167
|
|
168 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
169 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
170
|
37
|
171 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
172 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
|
173
|
|
174 while(nrow(patient.fuzzy) > 1){
|
|
175 first.merge = patient.fuzzy[1,"merge"]
|
|
176 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
177
|
|
178 merge.filter = first.merge == patient.fuzzy$merge
|
|
179
|
|
180 match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence)
|
34
|
181
|
37
|
182 if(sum(match.filter) == 2){
|
|
183 second.match = which(match.filter)[2]
|
|
184 second.clone.sequence = patient.fuzzy[second.match,"Clone_Sequence"]
|
|
185 first.sample = patient.fuzzy[1,"Sample"]
|
|
186 second.sample = patient.fuzzy[second.match,"Sample"]
|
34
|
187
|
37
|
188 if(((nchar(second.clone.sequence) - nchar(first.clone.sequence)) <= 9) & (first.sample != second.sample)){
|
|
189 first.match.row = patient.fuzzy[1,]
|
|
190 second.match.row = patient.fuzzy[second.match,]
|
|
191 print(paste(first.merge, first.match.row$normalized_read_count, second.match.row$normalized_read_count, first.clone.sequence, second.clone.sequence))
|
|
192 patientMerge.new.row = data.frame(merge=first.clone.sequence,
|
|
193 min_cell_paste.x=first.match.row[1,"min_cell_paste"],
|
|
194 Patient.x=first.match.row[1,"Patient"],
|
|
195 Receptor.x=first.match.row[1,"Receptor"],
|
|
196 Sample.x=first.match.row[1,"Sample"],
|
|
197 Cell_Count.x=first.match.row[1,"Cell_Count"],
|
|
198 Clone_Molecule_Count_From_Spikes.x=first.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
199 Log10_Frequency.x=first.match.row[1,"Log10_Frequency"],
|
|
200 Total_Read_Count.x=first.match.row[1,"Total_Read_Count"],
|
|
201 dsPerM.x=first.match.row[1,"dsPerM"],
|
|
202 J_Segment_Major_Gene.x=first.match.row[1,"J_Segment_Major_Gene"],
|
|
203 V_Segment_Major_Gene.x=first.match.row[1,"V_Segment_Major_Gene"],
|
|
204 Clone_Sequence.x=first.match.row[1,"Clone_Sequence"],
|
|
205 CDR3_Sense_Sequence.x=first.match.row[1,"CDR3_Sense_Sequence"],
|
|
206 Related_to_leukemia_clone.x=first.match.row[1,"Related_to_leukemia_clone"],
|
|
207 Frequency.x=first.match.row[1,"Frequency"],
|
|
208 locus_V.x=first.match.row[1,"locus_V"],
|
|
209 locus_J.x=first.match.row[1,"locus_J"],
|
|
210 min_cell_count.x=first.match.row[1,"min_cell_count"],
|
|
211 normalized_read_count.x=first.match.row[1,"normalized_read_count"],
|
|
212 paste.x=first.match.row[1,"paste"],
|
|
213 min_cell_paste.y=second.match.row[1,"min_cell_paste"],
|
|
214 Patient.y=second.match.row[1,"Patient"],
|
|
215 Receptor.y=second.match.row[1,"Receptor"],
|
|
216 Sample.y=second.match.row[1,"Sample"],
|
|
217 Cell_Count.y=second.match.row[1,"Cell_Count"],
|
|
218 Clone_Molecule_Count_From_Spikes.y=second.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
219 Log10_Frequency.y=second.match.row[1,"Log10_Frequency"],
|
|
220 Total_Read_Count.y=second.match.row[1,"Total_Read_Count"],
|
|
221 dsPerM.y=second.match.row[1,"dsPerM"],
|
|
222 J_Segment_Major_Gene.y=second.match.row[1,"J_Segment_Major_Gene"],
|
|
223 V_Segment_Major_Gene.y=second.match.row[1,"V_Segment_Major_Gene"],
|
|
224 Clone_Sequence.y=second.match.row[1,"Clone_Sequence"],
|
|
225 CDR3_Sense_Sequence.y=second.match.row[1,"CDR3_Sense_Sequence"],
|
|
226 Related_to_leukemia_clone.y=second.match.row[1,"Related_to_leukemia_clone"],
|
|
227 Frequency.y=second.match.row[1,"Frequency"],
|
|
228 locus_V.y=second.match.row[1,"locus_V"],
|
|
229 locus_J.y=second.match.row[1,"locus_J"],
|
|
230 min_cell_count.y=second.match.row[1,"min_cell_count"],
|
|
231 normalized_read_count.y=second.match.row[1,"normalized_read_count"],
|
|
232 paste.y=first.match.row[1,"paste"])
|
|
233
|
|
234
|
|
235 patientMerge = rbind(patientMerge, patientMerge.new.row)
|
|
236 patient.fuzzy = patient.fuzzy[-match.filter,]
|
|
237
|
|
238 patient1 = patient1[!(patient1$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
239 patient2 = patient2[!(patient2$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
240
|
|
241 scatterplot_data = scatterplot_data[scatterplot_data$merge != second.clone.sequence,]
|
|
242
|
|
243 } else {
|
|
244 patient.fuzzy = patient.fuzzy[-1,]
|
|
245 }
|
|
246
|
|
247 } else if (sum(match.filter) > 2){
|
40
|
248 cat(paste("Multiple matches found for", first.merge, "in", patient), file=logfile, append=T)
|
39
|
249 patient.fuzzy = patient.fuzzy[-1,]
|
37
|
250 } else {
|
|
251 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
252 }
|
37
|
253
|
|
254
|
34
|
255 }
|
|
256
|
|
257 }
|
|
258
|
37
|
259
|
18
|
260 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
261 res1 = vector()
|
|
262 res2 = vector()
|
|
263 resBoth = vector()
|
|
264 read1Count = vector()
|
|
265 read2Count = vector()
|
2
|
266 locussum1 = vector()
|
|
267 locussum2 = vector()
|
9
|
268
|
0
|
269 #for(iter in 1){
|
|
270 for(iter in 1:length(product[,1])){
|
|
271 threshhold = product[iter,threshholdIndex]
|
|
272 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
273 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
274 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
275 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
276 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
277 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
278 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
279 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
280 res1 = append(res1, sum(one))
|
2
|
281 res2 = append(res2, sum(two))
|
0
|
282 resBoth = append(resBoth, sum(both))
|
2
|
283 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
284 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
285 #threshhold = 0
|
|
286 if(threshhold != 0){
|
|
287 if(sum(one) > 0){
|
15
|
288 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
289 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
290 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
291 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
292 }
|
|
293 if(sum(two) > 0){
|
15
|
294 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
295 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
296 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
297 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
298 }
|
29
|
299 } else {
|
|
300 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
301 if(nrow(scatterplot_locus_data) > 0){
|
|
302 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
303 }
|
34
|
304 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
305 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
306 not_in_one = !in_one
|
|
307 if(any(in_two)){
|
|
308 scatterplot_locus_data[not_in_one,]$type = twoSample
|
|
309 }
|
30
|
310 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
|
|
311 if(any(in_both)){
|
|
312 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
313 }
|
29
|
314 if(type == "single"){
|
|
315 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
316 }
|
|
317 p = NULL
|
|
318 if(nrow(scatterplot_locus_data) != 0){
|
|
319 if(on == "normalized_read_count"){
|
30
|
320 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
38
|
321 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6)
|
29
|
322 } else {
|
30
|
323 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
29
|
324 }
|
|
325 p = p + geom_point(aes(colour=type), position="jitter")
|
|
326 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
327 } else {
|
|
328 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
329 }
|
|
330 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
331 print(p)
|
|
332 dev.off()
|
0
|
333 }
|
|
334 if(sum(both) > 0){
|
15
|
335 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
336 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
337 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
338 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
339 }
|
0
|
340 }
|
2
|
341 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
342 if(sum(is.na(patientResult$percentage)) > 0){
|
|
343 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
344 }
|
|
345 colnames(patientResult)[6] = oneSample
|
|
346 colnames(patientResult)[8] = twoSample
|
|
347 colnamesBak = colnames(patientResult)
|
2
|
348 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
349 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
350 colnames(patientResult) = colnamesBak
|
|
351
|
|
352 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
353 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
354
|
|
355 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
356 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
357 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
358 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
359 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
360 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
361 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
362 print(plt)
|
|
363 dev.off()
|
|
364 #(t,r,b,l)
|
|
365 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
366 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
367 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
368 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
369 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
370 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
371 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
372 print(plt)
|
|
373 dev.off()
|
|
374
|
|
375 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
376 patientResult$relativeValue = patientResult$value * 10
|
|
377 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
378 plt = ggplot(patientResult)
|
|
379 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
380 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
381 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
382 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
383 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
384 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
385 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
386 print(plt)
|
|
387 dev.off()
|
|
388 }
|
|
389
|
3
|
390 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
391
|
0
|
392 interval = intervalFreq
|
|
393 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
394 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
395 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
396
|
3
|
397 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
398
|
0
|
399 interval = intervalReads
|
|
400 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
401 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
402 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
403
|
3
|
404 cat("</table></html>", file=logfile, append=T)
|
|
405
|
33
|
406
|
|
407 if(nrow(single_patients) > 0){
|
|
408 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
409 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
410 p = p + geom_point(aes(colour=type), position="jitter")
|
|
411 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
412 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
413 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
414 print(p)
|
|
415 dev.off()
|
7
|
416
|
33
|
417 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
418 p = p + geom_point(aes(colour=type), position="jitter")
|
|
419 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
420 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
421 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
422 print(p)
|
|
423 dev.off()
|
|
424 } else {
|
|
425 empty <- data.frame()
|
|
426 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
427
|
|
428 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
429 print(p)
|
|
430 dev.off()
|
|
431
|
|
432 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
433 print(p)
|
|
434 dev.off()
|
|
435 }
|
7
|
436 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
437 onShort = "reads"
|
|
438 if(on == "Frequency"){
|
|
439 onShort = "freq"
|
|
440 }
|
18
|
441 onx = paste(on, ".x", sep="")
|
|
442 ony = paste(on, ".y", sep="")
|
|
443 onz = paste(on, ".z", sep="")
|
7
|
444 type="triplet"
|
|
445
|
|
446 threshholdIndex = which(colnames(product) == "interval")
|
|
447 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
448 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
449 titleIndex = which(colnames(product) == "Titles")
|
|
450 sampleIndex = which(colnames(patient1) == "Sample")
|
|
451 patientIndex = which(colnames(patient1) == "Patient")
|
|
452 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
453 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
454 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
455
|
29
|
456 if(mergeOn == "Clone_Sequence"){
|
|
457 patient1$merge = paste(patient1$Clone_Sequence)
|
|
458 patient2$merge = paste(patient2$Clone_Sequence)
|
|
459 patient3$merge = paste(patient3$Clone_Sequence)
|
|
460
|
|
461 } else {
|
|
462 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
463 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
464 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
465 }
|
9
|
466
|
|
467 patientMerge = merge(patient1, patient2, by="merge")
|
|
468 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
469 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
470 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
471 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
472 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
473 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
474 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
475 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
476 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
477
|
28
|
478
|
30
|
479 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
480 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
481 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
482 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
483
|
7
|
484 res1 = vector()
|
|
485 res2 = vector()
|
|
486 res3 = vector()
|
20
|
487 res12 = vector()
|
|
488 res13 = vector()
|
|
489 res23 = vector()
|
7
|
490 resAll = vector()
|
|
491 read1Count = vector()
|
|
492 read2Count = vector()
|
|
493 read3Count = vector()
|
|
494
|
|
495 if(appendTriplets){
|
9
|
496 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
497 }
|
|
498 for(iter in 1:length(product[,1])){
|
|
499 threshhold = product[iter,threshholdIndex]
|
|
500 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
501 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
502 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
503 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
504
|
30
|
505 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
506 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
507 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
508
|
30
|
509 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
510 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
511 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
512
|
18
|
513 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
514 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
515 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
516 res1 = append(res1, sum(one))
|
|
517 res2 = append(res2, sum(two))
|
|
518 res3 = append(res3, sum(three))
|
|
519 resAll = append(resAll, sum(all))
|
20
|
520 res12 = append(res12, sum(one_two))
|
|
521 res13 = append(res13, sum(one_three))
|
|
522 res23 = append(res23, sum(two_three))
|
7
|
523 #threshhold = 0
|
|
524 if(threshhold != 0){
|
|
525 if(sum(one) > 0){
|
20
|
526 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
527 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
528 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
529 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
530 }
|
|
531 if(sum(two) > 0){
|
20
|
532 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
533 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
534 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
535 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
536 }
|
|
537 if(sum(three) > 0){
|
20
|
538 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
539 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
540 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
541 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
542 }
|
20
|
543 if(sum(one_two) > 0){
|
|
544 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
545 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
546 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
547 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
548 }
|
|
549 if(sum(one_three) > 0){
|
|
550 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
551 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
552 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
553 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
554 }
|
|
555 if(sum(two_three) > 0){
|
|
556 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
557 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
558 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
559 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
560 }
|
|
561 } else { #scatterplot data
|
24
|
562 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
563 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
564 if(sum(in_two) > 0){
|
|
565 scatterplot_locus_data[in_two,]$type = "In two"
|
|
566 }
|
30
|
567 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
568 if(sum(in_three)> 0){
|
|
569 scatterplot_locus_data[in_three,]$type = "In three"
|
|
570 }
|
|
571 not_in_one = scatterplot_locus_data$type != "In one"
|
|
572 if(sum(not_in_one) > 0){
|
|
573 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
574 }
|
20
|
575 p = NULL
|
|
576 if(nrow(scatterplot_locus_data) != 0){
|
|
577 if(on == "normalized_read_count"){
|
31
|
578 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
579 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
580 } else {
|
32
|
581 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
582 }
|
|
583 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
584 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
585 } else {
|
|
586 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
587 }
|
|
588 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
589 print(p)
|
|
590 dev.off()
|
|
591 }
|
7
|
592 if(sum(all) > 0){
|
20
|
593 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
594 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
595 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
596 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
597 }
|
|
598 }
|
20
|
599 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
600 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
601 colnames(patientResult)[6] = oneSample
|
20
|
602 colnames(patientResult)[7] = twoSample
|
|
603 colnames(patientResult)[8] = threeSample
|
|
604 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
605 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
606 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
607
|
|
608 colnamesBak = colnames(patientResult)
|
20
|
609 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
610 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
611 colnames(patientResult) = colnamesBak
|
|
612
|
|
613 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
614 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
615
|
|
616 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
617 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
618 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
619 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
620 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
621 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
622 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
623 print(plt)
|
|
624 dev.off()
|
|
625
|
|
626 fontSize = 4
|
|
627
|
|
628 bak = patientResult
|
|
629 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
630 patientResult$relativeValue = patientResult$value * 10
|
|
631 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
632 plt = ggplot(patientResult)
|
|
633 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
634 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
635 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
636 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
637 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
638 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
639 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
640 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
641 print(plt)
|
|
642 dev.off()
|
|
643 }
|
|
644
|
33
|
645 if(nrow(triplets) != 0){
|
|
646 triplets$uniqueID = "ID"
|
|
647
|
|
648 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
649 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
650 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
651
|
|
652 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
653 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
654 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
655
|
|
656 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
657
|
|
658 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
659 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
660 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
661
|
|
662 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
663 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
664
|
|
665 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
666
|
|
667 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
668
|
|
669 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
670
|
|
671 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
672
|
|
673 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
674
|
|
675 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
676
|
|
677 #remove duplicate V+J+CDR3, add together numerical values
|
|
678 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
679 Cell_Count=unique(.SD$Cell_Count),
|
|
680 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
681 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
682 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
683 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
684 Frequency=sum(.SD$Frequency),
|
|
685 normalized_read_count=sum(.SD$normalized_read_count),
|
|
686 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
687 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
688
|
|
689
|
|
690 interval = intervalReads
|
|
691 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
692 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
693
|
|
694 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
695 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
696 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
697 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
698
|
|
699 one = triplets[triplets$Sample == "16278_Left",]
|
|
700 two = triplets[triplets$Sample == "26402_Left",]
|
|
701 three = triplets[triplets$Sample == "26759_Left",]
|
|
702 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
703
|
|
704 one = triplets[triplets$Sample == "16278_Right",]
|
|
705 two = triplets[triplets$Sample == "26402_Right",]
|
|
706 three = triplets[triplets$Sample == "26759_Right",]
|
|
707 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
708
|
|
709
|
|
710 interval = intervalFreq
|
|
711 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
712 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
713
|
|
714 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
715 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
716 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
717 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
718
|
|
719 one = triplets[triplets$Sample == "16278_Left",]
|
|
720 two = triplets[triplets$Sample == "26402_Left",]
|
|
721 three = triplets[triplets$Sample == "26759_Left",]
|
|
722 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
723
|
|
724 one = triplets[triplets$Sample == "16278_Right",]
|
|
725 two = triplets[triplets$Sample == "26402_Right",]
|
|
726 three = triplets[triplets$Sample == "26759_Right",]
|
|
727 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
728 } else {
|
|
729 cat("", file="triplets.txt")
|
|
730 }
|