0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
29
|
57 #remove duplicate V+J+CDR3, add together numerical values
|
34
|
58 if(mergeOn != "Clone_Sequence"){
|
|
59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
|
|
60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
|
|
61 Cell_Count=unique(.SD$Cell_Count),
|
|
62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
63 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
66 Frequency=sum(.SD$Frequency),
|
|
67 locus_V=unique(.SD$locus_V),
|
|
68 locus_J=unique(.SD$locus_J),
|
|
69 min_cell_count=unique(.SD$min_cell_count),
|
|
70 normalized_read_count=sum(.SD$normalized_read_count),
|
|
71 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
72 Clone_Sequence=.SD$Clone_Sequence[1],
|
|
73 min_cell_paste=.SD$min_cell_paste[1],
|
|
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
75 }
|
29
|
76
|
22
|
77 patients = split(dat, dat$Patient, drop=T)
|
9
|
78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
81 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
82 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
83 Titles = factor(Titles, levels=Titles)
|
|
84 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
85
|
29
|
86 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
87
|
0
|
88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
89 if (!is.data.frame(x) & is.list(x)){
|
|
90 x = x[[1]]
|
|
91 }
|
34
|
92 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
93 x = data.frame(x,stringsAsFactors=F)
|
0
|
94 onShort = "reads"
|
|
95 if(on == "Frequency"){
|
|
96 onShort = "freq"
|
|
97 }
|
18
|
98 onx = paste(on, ".x", sep="")
|
|
99 ony = paste(on, ".y", sep="")
|
0
|
100 splt = split(x, x$Sample, drop=T)
|
4
|
101 type="pair"
|
2
|
102 if(length(splt) == 1){
|
3
|
103 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
104 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
105 type="single"
|
2
|
106 }
|
0
|
107 patient1 = splt[[1]]
|
|
108 patient2 = splt[[2]]
|
|
109
|
|
110 threshholdIndex = which(colnames(product) == "interval")
|
|
111 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
112 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
113 titleIndex = which(colnames(product) == "Titles")
|
|
114 sampleIndex = which(colnames(x) == "Sample")
|
|
115 patientIndex = which(colnames(x) == "Patient")
|
|
116 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
117 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
118 patient = paste(x[1,patientIndex])
|
3
|
119
|
0
|
120 switched = F
|
|
121 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
122 tmp = twoSample
|
|
123 twoSample = oneSample
|
|
124 oneSample = tmp
|
|
125 tmp = patient1
|
|
126 patient1 = patient2
|
|
127 patient2 = tmp
|
|
128 switched = T
|
|
129 }
|
|
130 if(appendtxt){
|
4
|
131 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
132 }
|
3
|
133 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
9
|
134
|
29
|
135 if(mergeOn == "Clone_Sequence"){
|
|
136 patient1$merge = paste(patient1$Clone_Sequence)
|
|
137 patient2$merge = paste(patient2$Clone_Sequence)
|
|
138 } else {
|
|
139 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
140 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
141 }
|
9
|
142
|
30
|
143 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
144 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
147 scatterplot_data$on = onShort
|
|
148
|
9
|
149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
|
34
|
150
|
|
151
|
|
152 #fuzzy matching here...
|
|
153 if(mergeOn == "Clone_Sequence"){
|
|
154 merge.list = patientMerge$merge
|
|
155
|
|
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
158
|
36
|
159 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
161
|
|
162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
34
|
164
|
37
|
165 merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"])))
|
34
|
166 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
|
167
|
|
168 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
169 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
170
|
37
|
171 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
172 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
|
173
|
|
174 while(nrow(patient.fuzzy) > 1){
|
|
175 first.merge = patient.fuzzy[1,"merge"]
|
|
176 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
177
|
|
178 merge.filter = first.merge == patient.fuzzy$merge
|
|
179
|
42
|
180 length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
|
181
|
43
|
182 sample.filter = patient.fuzzy[1,"Sample"] != patient.fuzzy$Sample
|
|
183
|
44
|
184 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
|
185 match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & sample.filter
|
34
|
186
|
45
|
187 if(sum(match.filter) == 1){
|
|
188 second.match = which(match.filter)[1]
|
37
|
189 second.clone.sequence = patient.fuzzy[second.match,"Clone_Sequence"]
|
|
190 first.sample = patient.fuzzy[1,"Sample"]
|
|
191 second.sample = patient.fuzzy[second.match,"Sample"]
|
34
|
192
|
37
|
193 if(((nchar(second.clone.sequence) - nchar(first.clone.sequence)) <= 9) & (first.sample != second.sample)){
|
|
194 first.match.row = patient.fuzzy[1,]
|
|
195 second.match.row = patient.fuzzy[second.match,]
|
|
196 print(paste(first.merge, first.match.row$normalized_read_count, second.match.row$normalized_read_count, first.clone.sequence, second.clone.sequence))
|
|
197 patientMerge.new.row = data.frame(merge=first.clone.sequence,
|
|
198 min_cell_paste.x=first.match.row[1,"min_cell_paste"],
|
|
199 Patient.x=first.match.row[1,"Patient"],
|
|
200 Receptor.x=first.match.row[1,"Receptor"],
|
|
201 Sample.x=first.match.row[1,"Sample"],
|
|
202 Cell_Count.x=first.match.row[1,"Cell_Count"],
|
|
203 Clone_Molecule_Count_From_Spikes.x=first.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
204 Log10_Frequency.x=first.match.row[1,"Log10_Frequency"],
|
|
205 Total_Read_Count.x=first.match.row[1,"Total_Read_Count"],
|
|
206 dsPerM.x=first.match.row[1,"dsPerM"],
|
|
207 J_Segment_Major_Gene.x=first.match.row[1,"J_Segment_Major_Gene"],
|
|
208 V_Segment_Major_Gene.x=first.match.row[1,"V_Segment_Major_Gene"],
|
|
209 Clone_Sequence.x=first.match.row[1,"Clone_Sequence"],
|
|
210 CDR3_Sense_Sequence.x=first.match.row[1,"CDR3_Sense_Sequence"],
|
|
211 Related_to_leukemia_clone.x=first.match.row[1,"Related_to_leukemia_clone"],
|
|
212 Frequency.x=first.match.row[1,"Frequency"],
|
|
213 locus_V.x=first.match.row[1,"locus_V"],
|
|
214 locus_J.x=first.match.row[1,"locus_J"],
|
|
215 min_cell_count.x=first.match.row[1,"min_cell_count"],
|
|
216 normalized_read_count.x=first.match.row[1,"normalized_read_count"],
|
|
217 paste.x=first.match.row[1,"paste"],
|
|
218 min_cell_paste.y=second.match.row[1,"min_cell_paste"],
|
|
219 Patient.y=second.match.row[1,"Patient"],
|
|
220 Receptor.y=second.match.row[1,"Receptor"],
|
|
221 Sample.y=second.match.row[1,"Sample"],
|
|
222 Cell_Count.y=second.match.row[1,"Cell_Count"],
|
|
223 Clone_Molecule_Count_From_Spikes.y=second.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
224 Log10_Frequency.y=second.match.row[1,"Log10_Frequency"],
|
|
225 Total_Read_Count.y=second.match.row[1,"Total_Read_Count"],
|
|
226 dsPerM.y=second.match.row[1,"dsPerM"],
|
|
227 J_Segment_Major_Gene.y=second.match.row[1,"J_Segment_Major_Gene"],
|
|
228 V_Segment_Major_Gene.y=second.match.row[1,"V_Segment_Major_Gene"],
|
|
229 Clone_Sequence.y=second.match.row[1,"Clone_Sequence"],
|
|
230 CDR3_Sense_Sequence.y=second.match.row[1,"CDR3_Sense_Sequence"],
|
|
231 Related_to_leukemia_clone.y=second.match.row[1,"Related_to_leukemia_clone"],
|
|
232 Frequency.y=second.match.row[1,"Frequency"],
|
|
233 locus_V.y=second.match.row[1,"locus_V"],
|
|
234 locus_J.y=second.match.row[1,"locus_J"],
|
|
235 min_cell_count.y=second.match.row[1,"min_cell_count"],
|
|
236 normalized_read_count.y=second.match.row[1,"normalized_read_count"],
|
|
237 paste.y=first.match.row[1,"paste"])
|
|
238
|
|
239
|
|
240 patientMerge = rbind(patientMerge, patientMerge.new.row)
|
|
241 patient.fuzzy = patient.fuzzy[-match.filter,]
|
|
242
|
|
243 patient1 = patient1[!(patient1$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
244 patient2 = patient2[!(patient2$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
245
|
|
246 scatterplot_data = scatterplot_data[scatterplot_data$merge != second.clone.sequence,]
|
|
247
|
|
248 } else {
|
|
249 patient.fuzzy = patient.fuzzy[-1,]
|
|
250 }
|
|
251
|
46
|
252 } else if (sum(match.filter) > 1){
|
47
|
253 cat(paste("<tr><td>", "Multiple matches (", sum(match.filter), ") found for", first.merge, "in", patient, "</td></tr>", sep=" "), file=logfile, append=T)
|
39
|
254 patient.fuzzy = patient.fuzzy[-1,]
|
37
|
255 } else {
|
|
256 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
257 }
|
37
|
258
|
|
259
|
34
|
260 }
|
|
261
|
|
262 }
|
|
263
|
37
|
264
|
18
|
265 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
266 res1 = vector()
|
|
267 res2 = vector()
|
|
268 resBoth = vector()
|
|
269 read1Count = vector()
|
|
270 read2Count = vector()
|
2
|
271 locussum1 = vector()
|
|
272 locussum2 = vector()
|
9
|
273
|
0
|
274 #for(iter in 1){
|
|
275 for(iter in 1:length(product[,1])){
|
|
276 threshhold = product[iter,threshholdIndex]
|
|
277 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
278 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
279 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
280 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
281 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
282 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
283 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
284 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
285 res1 = append(res1, sum(one))
|
2
|
286 res2 = append(res2, sum(two))
|
0
|
287 resBoth = append(resBoth, sum(both))
|
2
|
288 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
289 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
290 #threshhold = 0
|
|
291 if(threshhold != 0){
|
|
292 if(sum(one) > 0){
|
15
|
293 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
294 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
295 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
296 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
297 }
|
|
298 if(sum(two) > 0){
|
15
|
299 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
300 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
301 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
302 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
303 }
|
29
|
304 } else {
|
|
305 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
306 if(nrow(scatterplot_locus_data) > 0){
|
|
307 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
308 }
|
34
|
309 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
310 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
311 not_in_one = !in_one
|
|
312 if(any(in_two)){
|
|
313 scatterplot_locus_data[not_in_one,]$type = twoSample
|
|
314 }
|
30
|
315 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
|
|
316 if(any(in_both)){
|
|
317 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
318 }
|
29
|
319 if(type == "single"){
|
|
320 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
321 }
|
|
322 p = NULL
|
|
323 if(nrow(scatterplot_locus_data) != 0){
|
|
324 if(on == "normalized_read_count"){
|
30
|
325 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
38
|
326 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6)
|
29
|
327 } else {
|
30
|
328 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
29
|
329 }
|
|
330 p = p + geom_point(aes(colour=type), position="jitter")
|
|
331 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
332 } else {
|
|
333 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
334 }
|
|
335 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
336 print(p)
|
|
337 dev.off()
|
0
|
338 }
|
|
339 if(sum(both) > 0){
|
15
|
340 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
341 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
342 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
343 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
344 }
|
0
|
345 }
|
2
|
346 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
347 if(sum(is.na(patientResult$percentage)) > 0){
|
|
348 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
349 }
|
|
350 colnames(patientResult)[6] = oneSample
|
|
351 colnames(patientResult)[8] = twoSample
|
|
352 colnamesBak = colnames(patientResult)
|
2
|
353 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
354 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
355 colnames(patientResult) = colnamesBak
|
|
356
|
|
357 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
358 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
359
|
|
360 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
361 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
362 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
363 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
364 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
365 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
366 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
367 print(plt)
|
|
368 dev.off()
|
|
369 #(t,r,b,l)
|
|
370 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
371 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
372 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
373 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
374 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
375 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
376 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
377 print(plt)
|
|
378 dev.off()
|
|
379
|
|
380 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
381 patientResult$relativeValue = patientResult$value * 10
|
|
382 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
383 plt = ggplot(patientResult)
|
|
384 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
385 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
386 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
387 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
388 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
389 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
390 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
391 print(plt)
|
|
392 dev.off()
|
|
393 }
|
|
394
|
3
|
395 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
396
|
0
|
397 interval = intervalFreq
|
|
398 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
399 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
400 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
401
|
3
|
402 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
403
|
0
|
404 interval = intervalReads
|
|
405 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
406 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
407 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
408
|
3
|
409 cat("</table></html>", file=logfile, append=T)
|
|
410
|
33
|
411
|
|
412 if(nrow(single_patients) > 0){
|
|
413 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
414 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
415 p = p + geom_point(aes(colour=type), position="jitter")
|
|
416 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
417 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
418 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
419 print(p)
|
|
420 dev.off()
|
7
|
421
|
33
|
422 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
423 p = p + geom_point(aes(colour=type), position="jitter")
|
|
424 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
425 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
426 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
427 print(p)
|
|
428 dev.off()
|
|
429 } else {
|
|
430 empty <- data.frame()
|
|
431 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
432
|
|
433 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
434 print(p)
|
|
435 dev.off()
|
|
436
|
|
437 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
438 print(p)
|
|
439 dev.off()
|
|
440 }
|
7
|
441 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
442 onShort = "reads"
|
|
443 if(on == "Frequency"){
|
|
444 onShort = "freq"
|
|
445 }
|
18
|
446 onx = paste(on, ".x", sep="")
|
|
447 ony = paste(on, ".y", sep="")
|
|
448 onz = paste(on, ".z", sep="")
|
7
|
449 type="triplet"
|
|
450
|
|
451 threshholdIndex = which(colnames(product) == "interval")
|
|
452 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
453 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
454 titleIndex = which(colnames(product) == "Titles")
|
|
455 sampleIndex = which(colnames(patient1) == "Sample")
|
|
456 patientIndex = which(colnames(patient1) == "Patient")
|
|
457 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
458 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
459 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
460
|
29
|
461 if(mergeOn == "Clone_Sequence"){
|
|
462 patient1$merge = paste(patient1$Clone_Sequence)
|
|
463 patient2$merge = paste(patient2$Clone_Sequence)
|
|
464 patient3$merge = paste(patient3$Clone_Sequence)
|
|
465
|
|
466 } else {
|
|
467 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
468 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
469 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
470 }
|
9
|
471
|
|
472 patientMerge = merge(patient1, patient2, by="merge")
|
|
473 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
474 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
475 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
476 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
477 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
478 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
479 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
480 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
481 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
482
|
28
|
483
|
30
|
484 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
485 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
486 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
487 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
488
|
7
|
489 res1 = vector()
|
|
490 res2 = vector()
|
|
491 res3 = vector()
|
20
|
492 res12 = vector()
|
|
493 res13 = vector()
|
|
494 res23 = vector()
|
7
|
495 resAll = vector()
|
|
496 read1Count = vector()
|
|
497 read2Count = vector()
|
|
498 read3Count = vector()
|
|
499
|
|
500 if(appendTriplets){
|
9
|
501 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
502 }
|
|
503 for(iter in 1:length(product[,1])){
|
|
504 threshhold = product[iter,threshholdIndex]
|
|
505 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
506 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
507 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
508 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
509
|
30
|
510 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
511 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
512 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
513
|
30
|
514 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
515 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
516 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
517
|
18
|
518 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
519 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
520 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
521 res1 = append(res1, sum(one))
|
|
522 res2 = append(res2, sum(two))
|
|
523 res3 = append(res3, sum(three))
|
|
524 resAll = append(resAll, sum(all))
|
20
|
525 res12 = append(res12, sum(one_two))
|
|
526 res13 = append(res13, sum(one_three))
|
|
527 res23 = append(res23, sum(two_three))
|
7
|
528 #threshhold = 0
|
|
529 if(threshhold != 0){
|
|
530 if(sum(one) > 0){
|
20
|
531 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
532 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
533 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
534 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
535 }
|
|
536 if(sum(two) > 0){
|
20
|
537 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
538 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
539 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
540 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
541 }
|
|
542 if(sum(three) > 0){
|
20
|
543 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
544 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
545 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
546 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
547 }
|
20
|
548 if(sum(one_two) > 0){
|
|
549 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
550 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
551 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
552 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
553 }
|
|
554 if(sum(one_three) > 0){
|
|
555 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
556 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
557 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
558 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
559 }
|
|
560 if(sum(two_three) > 0){
|
|
561 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
562 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
563 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
564 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
565 }
|
|
566 } else { #scatterplot data
|
24
|
567 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
568 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
569 if(sum(in_two) > 0){
|
|
570 scatterplot_locus_data[in_two,]$type = "In two"
|
|
571 }
|
30
|
572 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
573 if(sum(in_three)> 0){
|
|
574 scatterplot_locus_data[in_three,]$type = "In three"
|
|
575 }
|
|
576 not_in_one = scatterplot_locus_data$type != "In one"
|
|
577 if(sum(not_in_one) > 0){
|
|
578 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
579 }
|
20
|
580 p = NULL
|
|
581 if(nrow(scatterplot_locus_data) != 0){
|
|
582 if(on == "normalized_read_count"){
|
31
|
583 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
584 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
585 } else {
|
32
|
586 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
587 }
|
|
588 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
589 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
590 } else {
|
|
591 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
592 }
|
|
593 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
594 print(p)
|
|
595 dev.off()
|
|
596 }
|
7
|
597 if(sum(all) > 0){
|
20
|
598 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
599 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
600 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
601 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
602 }
|
|
603 }
|
20
|
604 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
605 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
606 colnames(patientResult)[6] = oneSample
|
20
|
607 colnames(patientResult)[7] = twoSample
|
|
608 colnames(patientResult)[8] = threeSample
|
|
609 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
610 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
611 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
612
|
|
613 colnamesBak = colnames(patientResult)
|
20
|
614 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
615 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
616 colnames(patientResult) = colnamesBak
|
|
617
|
|
618 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
619 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
620
|
|
621 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
622 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
623 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
624 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
625 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
626 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
627 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
628 print(plt)
|
|
629 dev.off()
|
|
630
|
|
631 fontSize = 4
|
|
632
|
|
633 bak = patientResult
|
|
634 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
635 patientResult$relativeValue = patientResult$value * 10
|
|
636 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
637 plt = ggplot(patientResult)
|
|
638 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
639 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
640 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
641 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
642 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
643 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
644 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
645 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
646 print(plt)
|
|
647 dev.off()
|
|
648 }
|
|
649
|
33
|
650 if(nrow(triplets) != 0){
|
|
651 triplets$uniqueID = "ID"
|
|
652
|
|
653 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
654 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
655 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
656
|
|
657 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
658 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
659 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
660
|
|
661 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
662
|
|
663 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
664 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
665 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
666
|
|
667 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
668 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
669
|
|
670 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
671
|
|
672 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
673
|
|
674 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
675
|
|
676 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
677
|
|
678 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
679
|
|
680 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
681
|
|
682 #remove duplicate V+J+CDR3, add together numerical values
|
|
683 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
684 Cell_Count=unique(.SD$Cell_Count),
|
|
685 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
686 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
687 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
688 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
689 Frequency=sum(.SD$Frequency),
|
|
690 normalized_read_count=sum(.SD$normalized_read_count),
|
|
691 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
692 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
693
|
|
694
|
|
695 interval = intervalReads
|
|
696 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
697 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
698
|
|
699 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
700 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
701 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
702 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
703
|
|
704 one = triplets[triplets$Sample == "16278_Left",]
|
|
705 two = triplets[triplets$Sample == "26402_Left",]
|
|
706 three = triplets[triplets$Sample == "26759_Left",]
|
|
707 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
708
|
|
709 one = triplets[triplets$Sample == "16278_Right",]
|
|
710 two = triplets[triplets$Sample == "26402_Right",]
|
|
711 three = triplets[triplets$Sample == "26759_Right",]
|
|
712 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
713
|
|
714
|
|
715 interval = intervalFreq
|
|
716 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
717 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
718
|
|
719 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
720 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
721 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
722 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
723
|
|
724 one = triplets[triplets$Sample == "16278_Left",]
|
|
725 two = triplets[triplets$Sample == "26402_Left",]
|
|
726 three = triplets[triplets$Sample == "26759_Left",]
|
|
727 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
728
|
|
729 one = triplets[triplets$Sample == "16278_Right",]
|
|
730 two = triplets[triplets$Sample == "26402_Right",]
|
|
731 three = triplets[triplets$Sample == "26759_Right",]
|
|
732 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
733 } else {
|
|
734 cat("", file="triplets.txt")
|
|
735 }
|