0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
29
|
57 #remove duplicate V+J+CDR3, add together numerical values
|
34
|
58 if(mergeOn != "Clone_Sequence"){
|
|
59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
|
|
60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
|
|
61 Cell_Count=unique(.SD$Cell_Count),
|
|
62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
63 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
66 Frequency=sum(.SD$Frequency),
|
|
67 locus_V=unique(.SD$locus_V),
|
|
68 locus_J=unique(.SD$locus_J),
|
|
69 min_cell_count=unique(.SD$min_cell_count),
|
|
70 normalized_read_count=sum(.SD$normalized_read_count),
|
|
71 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
72 Clone_Sequence=.SD$Clone_Sequence[1],
|
|
73 min_cell_paste=.SD$min_cell_paste[1],
|
|
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
75 }
|
29
|
76
|
22
|
77 patients = split(dat, dat$Patient, drop=T)
|
9
|
78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
81 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
82 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
83 Titles = factor(Titles, levels=Titles)
|
|
84 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
85
|
29
|
86 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
87
|
0
|
88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
89 if (!is.data.frame(x) & is.list(x)){
|
|
90 x = x[[1]]
|
|
91 }
|
34
|
92 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
93 x = data.frame(x,stringsAsFactors=F)
|
0
|
94 onShort = "reads"
|
|
95 if(on == "Frequency"){
|
|
96 onShort = "freq"
|
|
97 }
|
18
|
98 onx = paste(on, ".x", sep="")
|
|
99 ony = paste(on, ".y", sep="")
|
0
|
100 splt = split(x, x$Sample, drop=T)
|
4
|
101 type="pair"
|
2
|
102 if(length(splt) == 1){
|
3
|
103 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
104 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
105 type="single"
|
2
|
106 }
|
0
|
107 patient1 = splt[[1]]
|
|
108 patient2 = splt[[2]]
|
|
109
|
|
110 threshholdIndex = which(colnames(product) == "interval")
|
|
111 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
112 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
113 titleIndex = which(colnames(product) == "Titles")
|
|
114 sampleIndex = which(colnames(x) == "Sample")
|
|
115 patientIndex = which(colnames(x) == "Patient")
|
|
116 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
117 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
118 patient = paste(x[1,patientIndex])
|
3
|
119
|
0
|
120 switched = F
|
|
121 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
122 tmp = twoSample
|
|
123 twoSample = oneSample
|
|
124 oneSample = tmp
|
|
125 tmp = patient1
|
|
126 patient1 = patient2
|
|
127 patient2 = tmp
|
|
128 switched = T
|
|
129 }
|
|
130 if(appendtxt){
|
4
|
131 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
132 }
|
3
|
133 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
9
|
134
|
29
|
135 if(mergeOn == "Clone_Sequence"){
|
|
136 patient1$merge = paste(patient1$Clone_Sequence)
|
|
137 patient2$merge = paste(patient2$Clone_Sequence)
|
|
138 } else {
|
|
139 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
140 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
141 }
|
9
|
142
|
30
|
143 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
144 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
147 scatterplot_data$on = onShort
|
|
148
|
9
|
149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
|
34
|
150
|
|
151
|
|
152 #fuzzy matching here...
|
|
153 if(mergeOn == "Clone_Sequence"){
|
|
154 merge.list = patientMerge$merge
|
|
155
|
|
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
158
|
36
|
159 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
161
|
|
162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
34
|
164
|
37
|
165 merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"])))
|
34
|
166 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
|
167
|
|
168 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
169 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
170
|
37
|
171 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
172 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
|
173
|
|
174 while(nrow(patient.fuzzy) > 1){
|
|
175 first.merge = patient.fuzzy[1,"merge"]
|
|
176 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
177
|
|
178 merge.filter = first.merge == patient.fuzzy$merge
|
|
179
|
42
|
180 length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
|
181
|
|
182 match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter
|
34
|
183
|
37
|
184 if(sum(match.filter) == 2){
|
|
185 second.match = which(match.filter)[2]
|
|
186 second.clone.sequence = patient.fuzzy[second.match,"Clone_Sequence"]
|
|
187 first.sample = patient.fuzzy[1,"Sample"]
|
|
188 second.sample = patient.fuzzy[second.match,"Sample"]
|
34
|
189
|
37
|
190 if(((nchar(second.clone.sequence) - nchar(first.clone.sequence)) <= 9) & (first.sample != second.sample)){
|
|
191 first.match.row = patient.fuzzy[1,]
|
|
192 second.match.row = patient.fuzzy[second.match,]
|
|
193 print(paste(first.merge, first.match.row$normalized_read_count, second.match.row$normalized_read_count, first.clone.sequence, second.clone.sequence))
|
|
194 patientMerge.new.row = data.frame(merge=first.clone.sequence,
|
|
195 min_cell_paste.x=first.match.row[1,"min_cell_paste"],
|
|
196 Patient.x=first.match.row[1,"Patient"],
|
|
197 Receptor.x=first.match.row[1,"Receptor"],
|
|
198 Sample.x=first.match.row[1,"Sample"],
|
|
199 Cell_Count.x=first.match.row[1,"Cell_Count"],
|
|
200 Clone_Molecule_Count_From_Spikes.x=first.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
201 Log10_Frequency.x=first.match.row[1,"Log10_Frequency"],
|
|
202 Total_Read_Count.x=first.match.row[1,"Total_Read_Count"],
|
|
203 dsPerM.x=first.match.row[1,"dsPerM"],
|
|
204 J_Segment_Major_Gene.x=first.match.row[1,"J_Segment_Major_Gene"],
|
|
205 V_Segment_Major_Gene.x=first.match.row[1,"V_Segment_Major_Gene"],
|
|
206 Clone_Sequence.x=first.match.row[1,"Clone_Sequence"],
|
|
207 CDR3_Sense_Sequence.x=first.match.row[1,"CDR3_Sense_Sequence"],
|
|
208 Related_to_leukemia_clone.x=first.match.row[1,"Related_to_leukemia_clone"],
|
|
209 Frequency.x=first.match.row[1,"Frequency"],
|
|
210 locus_V.x=first.match.row[1,"locus_V"],
|
|
211 locus_J.x=first.match.row[1,"locus_J"],
|
|
212 min_cell_count.x=first.match.row[1,"min_cell_count"],
|
|
213 normalized_read_count.x=first.match.row[1,"normalized_read_count"],
|
|
214 paste.x=first.match.row[1,"paste"],
|
|
215 min_cell_paste.y=second.match.row[1,"min_cell_paste"],
|
|
216 Patient.y=second.match.row[1,"Patient"],
|
|
217 Receptor.y=second.match.row[1,"Receptor"],
|
|
218 Sample.y=second.match.row[1,"Sample"],
|
|
219 Cell_Count.y=second.match.row[1,"Cell_Count"],
|
|
220 Clone_Molecule_Count_From_Spikes.y=second.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
221 Log10_Frequency.y=second.match.row[1,"Log10_Frequency"],
|
|
222 Total_Read_Count.y=second.match.row[1,"Total_Read_Count"],
|
|
223 dsPerM.y=second.match.row[1,"dsPerM"],
|
|
224 J_Segment_Major_Gene.y=second.match.row[1,"J_Segment_Major_Gene"],
|
|
225 V_Segment_Major_Gene.y=second.match.row[1,"V_Segment_Major_Gene"],
|
|
226 Clone_Sequence.y=second.match.row[1,"Clone_Sequence"],
|
|
227 CDR3_Sense_Sequence.y=second.match.row[1,"CDR3_Sense_Sequence"],
|
|
228 Related_to_leukemia_clone.y=second.match.row[1,"Related_to_leukemia_clone"],
|
|
229 Frequency.y=second.match.row[1,"Frequency"],
|
|
230 locus_V.y=second.match.row[1,"locus_V"],
|
|
231 locus_J.y=second.match.row[1,"locus_J"],
|
|
232 min_cell_count.y=second.match.row[1,"min_cell_count"],
|
|
233 normalized_read_count.y=second.match.row[1,"normalized_read_count"],
|
|
234 paste.y=first.match.row[1,"paste"])
|
|
235
|
|
236
|
|
237 patientMerge = rbind(patientMerge, patientMerge.new.row)
|
|
238 patient.fuzzy = patient.fuzzy[-match.filter,]
|
|
239
|
|
240 patient1 = patient1[!(patient1$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
241 patient2 = patient2[!(patient2$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
242
|
|
243 scatterplot_data = scatterplot_data[scatterplot_data$merge != second.clone.sequence,]
|
|
244
|
|
245 } else {
|
|
246 patient.fuzzy = patient.fuzzy[-1,]
|
|
247 }
|
|
248
|
|
249 } else if (sum(match.filter) > 2){
|
41
|
250 cat(paste("<tr><td>", "Multiple matches found for", first.merge, "in", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
39
|
251 patient.fuzzy = patient.fuzzy[-1,]
|
37
|
252 } else {
|
|
253 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
254 }
|
37
|
255
|
|
256
|
34
|
257 }
|
|
258
|
|
259 }
|
|
260
|
37
|
261
|
18
|
262 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
263 res1 = vector()
|
|
264 res2 = vector()
|
|
265 resBoth = vector()
|
|
266 read1Count = vector()
|
|
267 read2Count = vector()
|
2
|
268 locussum1 = vector()
|
|
269 locussum2 = vector()
|
9
|
270
|
0
|
271 #for(iter in 1){
|
|
272 for(iter in 1:length(product[,1])){
|
|
273 threshhold = product[iter,threshholdIndex]
|
|
274 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
275 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
276 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
277 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
278 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
279 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
280 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
281 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
282 res1 = append(res1, sum(one))
|
2
|
283 res2 = append(res2, sum(two))
|
0
|
284 resBoth = append(resBoth, sum(both))
|
2
|
285 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
286 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
287 #threshhold = 0
|
|
288 if(threshhold != 0){
|
|
289 if(sum(one) > 0){
|
15
|
290 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
291 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
292 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
293 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
294 }
|
|
295 if(sum(two) > 0){
|
15
|
296 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
297 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
298 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
299 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
300 }
|
29
|
301 } else {
|
|
302 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
303 if(nrow(scatterplot_locus_data) > 0){
|
|
304 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
305 }
|
34
|
306 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
307 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
308 not_in_one = !in_one
|
|
309 if(any(in_two)){
|
|
310 scatterplot_locus_data[not_in_one,]$type = twoSample
|
|
311 }
|
30
|
312 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
|
|
313 if(any(in_both)){
|
|
314 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
315 }
|
29
|
316 if(type == "single"){
|
|
317 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
318 }
|
|
319 p = NULL
|
|
320 if(nrow(scatterplot_locus_data) != 0){
|
|
321 if(on == "normalized_read_count"){
|
30
|
322 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
38
|
323 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6)
|
29
|
324 } else {
|
30
|
325 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
29
|
326 }
|
|
327 p = p + geom_point(aes(colour=type), position="jitter")
|
|
328 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
329 } else {
|
|
330 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
331 }
|
|
332 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
333 print(p)
|
|
334 dev.off()
|
0
|
335 }
|
|
336 if(sum(both) > 0){
|
15
|
337 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
338 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
339 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
340 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
341 }
|
0
|
342 }
|
2
|
343 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
344 if(sum(is.na(patientResult$percentage)) > 0){
|
|
345 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
346 }
|
|
347 colnames(patientResult)[6] = oneSample
|
|
348 colnames(patientResult)[8] = twoSample
|
|
349 colnamesBak = colnames(patientResult)
|
2
|
350 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
351 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
352 colnames(patientResult) = colnamesBak
|
|
353
|
|
354 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
355 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
356
|
|
357 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
358 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
359 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
360 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
361 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
362 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
363 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
364 print(plt)
|
|
365 dev.off()
|
|
366 #(t,r,b,l)
|
|
367 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
368 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
369 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
370 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
371 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
372 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
373 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
374 print(plt)
|
|
375 dev.off()
|
|
376
|
|
377 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
378 patientResult$relativeValue = patientResult$value * 10
|
|
379 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
380 plt = ggplot(patientResult)
|
|
381 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
382 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
383 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
384 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
385 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
386 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
387 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
388 print(plt)
|
|
389 dev.off()
|
|
390 }
|
|
391
|
3
|
392 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
393
|
0
|
394 interval = intervalFreq
|
|
395 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
396 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
397 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
398
|
3
|
399 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
400
|
0
|
401 interval = intervalReads
|
|
402 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
403 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
404 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
405
|
3
|
406 cat("</table></html>", file=logfile, append=T)
|
|
407
|
33
|
408
|
|
409 if(nrow(single_patients) > 0){
|
|
410 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
411 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
412 p = p + geom_point(aes(colour=type), position="jitter")
|
|
413 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
414 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
415 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
416 print(p)
|
|
417 dev.off()
|
7
|
418
|
33
|
419 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
420 p = p + geom_point(aes(colour=type), position="jitter")
|
|
421 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
422 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
423 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
424 print(p)
|
|
425 dev.off()
|
|
426 } else {
|
|
427 empty <- data.frame()
|
|
428 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
429
|
|
430 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
431 print(p)
|
|
432 dev.off()
|
|
433
|
|
434 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
435 print(p)
|
|
436 dev.off()
|
|
437 }
|
7
|
438 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
439 onShort = "reads"
|
|
440 if(on == "Frequency"){
|
|
441 onShort = "freq"
|
|
442 }
|
18
|
443 onx = paste(on, ".x", sep="")
|
|
444 ony = paste(on, ".y", sep="")
|
|
445 onz = paste(on, ".z", sep="")
|
7
|
446 type="triplet"
|
|
447
|
|
448 threshholdIndex = which(colnames(product) == "interval")
|
|
449 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
450 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
451 titleIndex = which(colnames(product) == "Titles")
|
|
452 sampleIndex = which(colnames(patient1) == "Sample")
|
|
453 patientIndex = which(colnames(patient1) == "Patient")
|
|
454 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
455 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
456 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
457
|
29
|
458 if(mergeOn == "Clone_Sequence"){
|
|
459 patient1$merge = paste(patient1$Clone_Sequence)
|
|
460 patient2$merge = paste(patient2$Clone_Sequence)
|
|
461 patient3$merge = paste(patient3$Clone_Sequence)
|
|
462
|
|
463 } else {
|
|
464 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
465 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
466 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
467 }
|
9
|
468
|
|
469 patientMerge = merge(patient1, patient2, by="merge")
|
|
470 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
471 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
472 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
473 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
474 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
475 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
476 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
477 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
478 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
479
|
28
|
480
|
30
|
481 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
482 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
483 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
484 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
485
|
7
|
486 res1 = vector()
|
|
487 res2 = vector()
|
|
488 res3 = vector()
|
20
|
489 res12 = vector()
|
|
490 res13 = vector()
|
|
491 res23 = vector()
|
7
|
492 resAll = vector()
|
|
493 read1Count = vector()
|
|
494 read2Count = vector()
|
|
495 read3Count = vector()
|
|
496
|
|
497 if(appendTriplets){
|
9
|
498 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
499 }
|
|
500 for(iter in 1:length(product[,1])){
|
|
501 threshhold = product[iter,threshholdIndex]
|
|
502 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
503 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
504 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
505 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
506
|
30
|
507 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
508 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
509 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
510
|
30
|
511 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
512 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
513 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
514
|
18
|
515 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
516 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
517 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
518 res1 = append(res1, sum(one))
|
|
519 res2 = append(res2, sum(two))
|
|
520 res3 = append(res3, sum(three))
|
|
521 resAll = append(resAll, sum(all))
|
20
|
522 res12 = append(res12, sum(one_two))
|
|
523 res13 = append(res13, sum(one_three))
|
|
524 res23 = append(res23, sum(two_three))
|
7
|
525 #threshhold = 0
|
|
526 if(threshhold != 0){
|
|
527 if(sum(one) > 0){
|
20
|
528 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
529 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
530 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
531 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
532 }
|
|
533 if(sum(two) > 0){
|
20
|
534 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
535 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
536 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
537 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
538 }
|
|
539 if(sum(three) > 0){
|
20
|
540 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
541 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
542 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
543 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
544 }
|
20
|
545 if(sum(one_two) > 0){
|
|
546 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
547 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
548 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
549 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
550 }
|
|
551 if(sum(one_three) > 0){
|
|
552 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
553 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
554 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
555 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
556 }
|
|
557 if(sum(two_three) > 0){
|
|
558 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
559 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
560 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
561 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
562 }
|
|
563 } else { #scatterplot data
|
24
|
564 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
565 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
566 if(sum(in_two) > 0){
|
|
567 scatterplot_locus_data[in_two,]$type = "In two"
|
|
568 }
|
30
|
569 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
570 if(sum(in_three)> 0){
|
|
571 scatterplot_locus_data[in_three,]$type = "In three"
|
|
572 }
|
|
573 not_in_one = scatterplot_locus_data$type != "In one"
|
|
574 if(sum(not_in_one) > 0){
|
|
575 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
576 }
|
20
|
577 p = NULL
|
|
578 if(nrow(scatterplot_locus_data) != 0){
|
|
579 if(on == "normalized_read_count"){
|
31
|
580 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
581 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
582 } else {
|
32
|
583 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
584 }
|
|
585 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
586 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
587 } else {
|
|
588 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
589 }
|
|
590 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
591 print(p)
|
|
592 dev.off()
|
|
593 }
|
7
|
594 if(sum(all) > 0){
|
20
|
595 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
596 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
597 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
598 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
599 }
|
|
600 }
|
20
|
601 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
602 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
603 colnames(patientResult)[6] = oneSample
|
20
|
604 colnames(patientResult)[7] = twoSample
|
|
605 colnames(patientResult)[8] = threeSample
|
|
606 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
607 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
608 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
609
|
|
610 colnamesBak = colnames(patientResult)
|
20
|
611 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
612 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
613 colnames(patientResult) = colnamesBak
|
|
614
|
|
615 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
616 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
617
|
|
618 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
619 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
620 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
621 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
622 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
623 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
624 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
625 print(plt)
|
|
626 dev.off()
|
|
627
|
|
628 fontSize = 4
|
|
629
|
|
630 bak = patientResult
|
|
631 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
632 patientResult$relativeValue = patientResult$value * 10
|
|
633 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
634 plt = ggplot(patientResult)
|
|
635 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
636 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
637 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
638 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
639 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
640 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
641 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
642 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
643 print(plt)
|
|
644 dev.off()
|
|
645 }
|
|
646
|
33
|
647 if(nrow(triplets) != 0){
|
|
648 triplets$uniqueID = "ID"
|
|
649
|
|
650 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
651 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
652 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
653
|
|
654 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
655 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
656 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
657
|
|
658 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
659
|
|
660 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
661 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
662 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
663
|
|
664 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
665 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
666
|
|
667 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
668
|
|
669 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
670
|
|
671 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
672
|
|
673 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
674
|
|
675 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
676
|
|
677 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
678
|
|
679 #remove duplicate V+J+CDR3, add together numerical values
|
|
680 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
681 Cell_Count=unique(.SD$Cell_Count),
|
|
682 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
683 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
684 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
685 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
686 Frequency=sum(.SD$Frequency),
|
|
687 normalized_read_count=sum(.SD$normalized_read_count),
|
|
688 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
689 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
690
|
|
691
|
|
692 interval = intervalReads
|
|
693 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
694 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
695
|
|
696 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
697 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
698 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
699 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
700
|
|
701 one = triplets[triplets$Sample == "16278_Left",]
|
|
702 two = triplets[triplets$Sample == "26402_Left",]
|
|
703 three = triplets[triplets$Sample == "26759_Left",]
|
|
704 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
705
|
|
706 one = triplets[triplets$Sample == "16278_Right",]
|
|
707 two = triplets[triplets$Sample == "26402_Right",]
|
|
708 three = triplets[triplets$Sample == "26759_Right",]
|
|
709 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
710
|
|
711
|
|
712 interval = intervalFreq
|
|
713 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
714 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
715
|
|
716 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
717 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
718 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
719 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
720
|
|
721 one = triplets[triplets$Sample == "16278_Left",]
|
|
722 two = triplets[triplets$Sample == "26402_Left",]
|
|
723 three = triplets[triplets$Sample == "26759_Left",]
|
|
724 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
725
|
|
726 one = triplets[triplets$Sample == "16278_Right",]
|
|
727 two = triplets[triplets$Sample == "26402_Right",]
|
|
728 three = triplets[triplets$Sample == "26759_Right",]
|
|
729 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
730 } else {
|
|
731 cat("", file="triplets.txt")
|
|
732 }
|