0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
29
|
57 #remove duplicate V+J+CDR3, add together numerical values
|
34
|
58 if(mergeOn != "Clone_Sequence"){
|
|
59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
|
|
60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
|
|
61 Cell_Count=unique(.SD$Cell_Count),
|
|
62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
63 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
66 Frequency=sum(.SD$Frequency),
|
|
67 locus_V=unique(.SD$locus_V),
|
|
68 locus_J=unique(.SD$locus_J),
|
|
69 min_cell_count=unique(.SD$min_cell_count),
|
|
70 normalized_read_count=sum(.SD$normalized_read_count),
|
|
71 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
72 Clone_Sequence=.SD$Clone_Sequence[1],
|
|
73 min_cell_paste=.SD$min_cell_paste[1],
|
|
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
75 }
|
29
|
76
|
22
|
77 patients = split(dat, dat$Patient, drop=T)
|
9
|
78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
81 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
82 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
83 Titles = factor(Titles, levels=Titles)
|
|
84 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
85
|
29
|
86 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
87
|
0
|
88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
89 if (!is.data.frame(x) & is.list(x)){
|
|
90 x = x[[1]]
|
|
91 }
|
34
|
92 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
93 x = data.frame(x,stringsAsFactors=F)
|
0
|
94 onShort = "reads"
|
|
95 if(on == "Frequency"){
|
|
96 onShort = "freq"
|
|
97 }
|
18
|
98 onx = paste(on, ".x", sep="")
|
|
99 ony = paste(on, ".y", sep="")
|
0
|
100 splt = split(x, x$Sample, drop=T)
|
4
|
101 type="pair"
|
2
|
102 if(length(splt) == 1){
|
3
|
103 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
104 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
105 type="single"
|
2
|
106 }
|
0
|
107 patient1 = splt[[1]]
|
|
108 patient2 = splt[[2]]
|
|
109
|
|
110 threshholdIndex = which(colnames(product) == "interval")
|
|
111 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
112 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
113 titleIndex = which(colnames(product) == "Titles")
|
|
114 sampleIndex = which(colnames(x) == "Sample")
|
|
115 patientIndex = which(colnames(x) == "Patient")
|
|
116 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
117 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
118 patient = paste(x[1,patientIndex])
|
3
|
119
|
0
|
120 switched = F
|
|
121 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
122 tmp = twoSample
|
|
123 twoSample = oneSample
|
|
124 oneSample = tmp
|
|
125 tmp = patient1
|
|
126 patient1 = patient2
|
|
127 patient2 = tmp
|
|
128 switched = T
|
|
129 }
|
|
130 if(appendtxt){
|
4
|
131 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
132 }
|
3
|
133 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
9
|
134
|
29
|
135 if(mergeOn == "Clone_Sequence"){
|
|
136 patient1$merge = paste(patient1$Clone_Sequence)
|
|
137 patient2$merge = paste(patient2$Clone_Sequence)
|
|
138 } else {
|
|
139 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
140 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
141 }
|
9
|
142
|
30
|
143 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
144 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
147 scatterplot_data$on = onShort
|
|
148
|
9
|
149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
|
34
|
150
|
|
151
|
|
152 #fuzzy matching here...
|
|
153 if(mergeOn == "Clone_Sequence"){
|
|
154 merge.list = patientMerge$merge
|
|
155
|
|
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
158
|
|
159 patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
161
|
|
162 merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge)))
|
|
163 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
|
164
|
|
165 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
166 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
167
|
|
168 while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){
|
|
169 current.merge.1 = patient1.fuzzy[1,"merge"]
|
|
170 current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"]
|
|
171 current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,]
|
|
172
|
|
173 #agrep/adist the two samples
|
|
174 agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 1, costs=list(insertions=0.1, deletions=0.1, substitutions=1))
|
|
175
|
|
176
|
|
177 if(length(agrep.match) == 1){
|
|
178 current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"]
|
|
179 patientMerge.new.row = data.frame(merge=current.clone.seq.1,
|
|
180 min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"],
|
|
181 Patient.x=patient1.fuzzy[1,"Patient"],
|
|
182 Receptor.x=patient1.fuzzy[1,"Receptor"],
|
|
183 Sample.x=patient1.fuzzy[1,"Sample"],
|
|
184 Cell_Count.x=patient1.fuzzy[1,"Cell_Count"],
|
|
185 Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"],
|
|
186 Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"],
|
|
187 Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"],
|
|
188 dsPerM.x=patient1.fuzzy[1,"dsPerM"],
|
|
189 J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"],
|
|
190 V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"],
|
|
191 Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"],
|
|
192 CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"],
|
|
193 Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"],
|
|
194 Frequency.x=patient1.fuzzy[1,"Frequency"],
|
|
195 locus_V.x=patient1.fuzzy[1,"locus_V"],
|
|
196 locus_J.x=patient1.fuzzy[1,"locus_J"],
|
|
197 min_cell_count.x=patient1.fuzzy[1,"min_cell_count"],
|
|
198 normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"],
|
|
199 paste.x=patient1.fuzzy[1,"paste"],
|
|
200 min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"],
|
|
201 Patient.y=patient2.fuzzy[agrep.match,"Patient"],
|
|
202 Receptor.y=patient2.fuzzy[agrep.match,"Receptor"],
|
|
203 Sample.y=patient2.fuzzy[agrep.match,"Sample"],
|
|
204 Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"],
|
|
205 Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"],
|
|
206 Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"],
|
|
207 Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"],
|
|
208 dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"],
|
|
209 J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"],
|
|
210 V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"],
|
|
211 Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"],
|
|
212 CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"],
|
|
213 Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"],
|
|
214 Frequency.y=patient2.fuzzy[agrep.match,"Frequency"],
|
|
215 locus_V.y=patient2.fuzzy[agrep.match,"locus_V"],
|
|
216 locus_J.y=patient2.fuzzy[agrep.match,"locus_J"],
|
|
217 min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"],
|
|
218 normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"],
|
|
219 paste.y=patient2.fuzzy[agrep.match,"paste"])
|
|
220 #add to patientMerge
|
|
221 patientMerge = rbind(patientMerge, patientMerge.new.row)
|
|
222 #remove from patient*.fuzzy
|
|
223
|
|
224
|
|
225 #remove the fuzzy merged clone sequences from the original datasets
|
|
226 patient1 <<- patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],]
|
|
227 patient2 <<- patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],]
|
|
228
|
|
229 scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,]
|
|
230
|
|
231 patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),]
|
|
232
|
|
233
|
|
234 } else if (length(agrep.match) > 1){
|
|
235 #multiple matches, whatdo?
|
|
236 cat(paste("<tr><td>", "Multiple matches found for ", current.merge.1, " in ", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
|
237 }
|
|
238 patient1.fuzzy = patient1.fuzzy[-1,]
|
|
239 }
|
|
240
|
|
241 #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1))
|
|
242 }
|
|
243
|
18
|
244 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
245 res1 = vector()
|
|
246 res2 = vector()
|
|
247 resBoth = vector()
|
|
248 read1Count = vector()
|
|
249 read2Count = vector()
|
2
|
250 locussum1 = vector()
|
|
251 locussum2 = vector()
|
9
|
252
|
0
|
253 #for(iter in 1){
|
|
254 for(iter in 1:length(product[,1])){
|
|
255 threshhold = product[iter,threshholdIndex]
|
|
256 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
257 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
258 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
259 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
260 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
261 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
262 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
263 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
264 res1 = append(res1, sum(one))
|
2
|
265 res2 = append(res2, sum(two))
|
0
|
266 resBoth = append(resBoth, sum(both))
|
2
|
267 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
268 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
269 #threshhold = 0
|
|
270 if(threshhold != 0){
|
|
271 if(sum(one) > 0){
|
15
|
272 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
273 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
274 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
275 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
276 }
|
|
277 if(sum(two) > 0){
|
15
|
278 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
279 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
280 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
281 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
282 }
|
29
|
283 } else {
|
|
284 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
285 if(nrow(scatterplot_locus_data) > 0){
|
|
286 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
287 }
|
34
|
288 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
289 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
290 not_in_one = !in_one
|
|
291 if(any(in_two)){
|
|
292 scatterplot_locus_data[not_in_one,]$type = twoSample
|
|
293 }
|
30
|
294 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
|
|
295 if(any(in_both)){
|
|
296 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
297 }
|
29
|
298 if(type == "single"){
|
|
299 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
300 }
|
|
301 p = NULL
|
|
302 if(nrow(scatterplot_locus_data) != 0){
|
|
303 if(on == "normalized_read_count"){
|
30
|
304 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
34
|
305 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
|
29
|
306 } else {
|
30
|
307 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
29
|
308 }
|
|
309 p = p + geom_point(aes(colour=type), position="jitter")
|
|
310 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
311 } else {
|
|
312 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
313 }
|
|
314 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
315 print(p)
|
|
316 dev.off()
|
0
|
317 }
|
|
318 if(sum(both) > 0){
|
15
|
319 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
320 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
321 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
322 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
323 }
|
0
|
324 }
|
2
|
325 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
326 if(sum(is.na(patientResult$percentage)) > 0){
|
|
327 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
328 }
|
|
329 colnames(patientResult)[6] = oneSample
|
|
330 colnames(patientResult)[8] = twoSample
|
|
331 colnamesBak = colnames(patientResult)
|
2
|
332 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
333 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
334 colnames(patientResult) = colnamesBak
|
|
335
|
|
336 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
337 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
338
|
|
339 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
340 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
341 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
342 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
343 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
344 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
345 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
346 print(plt)
|
|
347 dev.off()
|
|
348 #(t,r,b,l)
|
|
349 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
350 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
351 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
352 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
353 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
354 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
355 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
356 print(plt)
|
|
357 dev.off()
|
|
358
|
|
359 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
360 patientResult$relativeValue = patientResult$value * 10
|
|
361 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
362 plt = ggplot(patientResult)
|
|
363 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
364 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
365 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
366 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
367 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
368 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
369 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
370 print(plt)
|
|
371 dev.off()
|
|
372 }
|
|
373
|
3
|
374 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
375
|
0
|
376 interval = intervalFreq
|
|
377 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
378 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
379 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
380
|
3
|
381 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
382
|
0
|
383 interval = intervalReads
|
|
384 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
385 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
386 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
387
|
3
|
388 cat("</table></html>", file=logfile, append=T)
|
|
389
|
33
|
390
|
|
391 if(nrow(single_patients) > 0){
|
|
392 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
393 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
394 p = p + geom_point(aes(colour=type), position="jitter")
|
|
395 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
396 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
397 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
398 print(p)
|
|
399 dev.off()
|
7
|
400
|
33
|
401 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
402 p = p + geom_point(aes(colour=type), position="jitter")
|
|
403 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
404 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
405 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
406 print(p)
|
|
407 dev.off()
|
|
408 } else {
|
|
409 empty <- data.frame()
|
|
410 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
411
|
|
412 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
413 print(p)
|
|
414 dev.off()
|
|
415
|
|
416 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
417 print(p)
|
|
418 dev.off()
|
|
419 }
|
7
|
420 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
421 onShort = "reads"
|
|
422 if(on == "Frequency"){
|
|
423 onShort = "freq"
|
|
424 }
|
18
|
425 onx = paste(on, ".x", sep="")
|
|
426 ony = paste(on, ".y", sep="")
|
|
427 onz = paste(on, ".z", sep="")
|
7
|
428 type="triplet"
|
|
429
|
|
430 threshholdIndex = which(colnames(product) == "interval")
|
|
431 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
432 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
433 titleIndex = which(colnames(product) == "Titles")
|
|
434 sampleIndex = which(colnames(patient1) == "Sample")
|
|
435 patientIndex = which(colnames(patient1) == "Patient")
|
|
436 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
437 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
438 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
439
|
29
|
440 if(mergeOn == "Clone_Sequence"){
|
|
441 patient1$merge = paste(patient1$Clone_Sequence)
|
|
442 patient2$merge = paste(patient2$Clone_Sequence)
|
|
443 patient3$merge = paste(patient3$Clone_Sequence)
|
|
444
|
|
445 } else {
|
|
446 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
447 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
448 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
449 }
|
9
|
450
|
|
451 patientMerge = merge(patient1, patient2, by="merge")
|
|
452 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
453 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
454 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
455 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
456 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
457 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
458 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
459 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
460 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
461
|
28
|
462
|
30
|
463 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
464 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
465 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
466 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
467
|
7
|
468 res1 = vector()
|
|
469 res2 = vector()
|
|
470 res3 = vector()
|
20
|
471 res12 = vector()
|
|
472 res13 = vector()
|
|
473 res23 = vector()
|
7
|
474 resAll = vector()
|
|
475 read1Count = vector()
|
|
476 read2Count = vector()
|
|
477 read3Count = vector()
|
|
478
|
|
479 if(appendTriplets){
|
9
|
480 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
481 }
|
|
482 for(iter in 1:length(product[,1])){
|
|
483 threshhold = product[iter,threshholdIndex]
|
|
484 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
485 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
486 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
487 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
488
|
30
|
489 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
490 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
491 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
492
|
30
|
493 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
494 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
495 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
496
|
18
|
497 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
498 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
499 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
500 res1 = append(res1, sum(one))
|
|
501 res2 = append(res2, sum(two))
|
|
502 res3 = append(res3, sum(three))
|
|
503 resAll = append(resAll, sum(all))
|
20
|
504 res12 = append(res12, sum(one_two))
|
|
505 res13 = append(res13, sum(one_three))
|
|
506 res23 = append(res23, sum(two_three))
|
7
|
507 #threshhold = 0
|
|
508 if(threshhold != 0){
|
|
509 if(sum(one) > 0){
|
20
|
510 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
511 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
512 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
513 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
514 }
|
|
515 if(sum(two) > 0){
|
20
|
516 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
517 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
518 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
519 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
520 }
|
|
521 if(sum(three) > 0){
|
20
|
522 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
523 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
524 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
525 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
526 }
|
20
|
527 if(sum(one_two) > 0){
|
|
528 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
529 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
530 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
531 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
532 }
|
|
533 if(sum(one_three) > 0){
|
|
534 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
535 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
536 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
537 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
538 }
|
|
539 if(sum(two_three) > 0){
|
|
540 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
541 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
542 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
543 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
544 }
|
|
545 } else { #scatterplot data
|
24
|
546 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
547 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
548 if(sum(in_two) > 0){
|
|
549 scatterplot_locus_data[in_two,]$type = "In two"
|
|
550 }
|
30
|
551 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
552 if(sum(in_three)> 0){
|
|
553 scatterplot_locus_data[in_three,]$type = "In three"
|
|
554 }
|
|
555 not_in_one = scatterplot_locus_data$type != "In one"
|
|
556 if(sum(not_in_one) > 0){
|
|
557 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
558 }
|
20
|
559 p = NULL
|
|
560 if(nrow(scatterplot_locus_data) != 0){
|
|
561 if(on == "normalized_read_count"){
|
31
|
562 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
563 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
564 } else {
|
32
|
565 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
566 }
|
|
567 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
568 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
569 } else {
|
|
570 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
571 }
|
|
572 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
573 print(p)
|
|
574 dev.off()
|
|
575 }
|
7
|
576 if(sum(all) > 0){
|
20
|
577 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
578 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
579 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
580 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
581 }
|
|
582 }
|
20
|
583 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
584 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
585 colnames(patientResult)[6] = oneSample
|
20
|
586 colnames(patientResult)[7] = twoSample
|
|
587 colnames(patientResult)[8] = threeSample
|
|
588 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
589 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
590 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
591
|
|
592 colnamesBak = colnames(patientResult)
|
20
|
593 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
594 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
595 colnames(patientResult) = colnamesBak
|
|
596
|
|
597 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
598 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
599
|
|
600 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
601 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
602 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
603 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
604 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
605 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
606 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
607 print(plt)
|
|
608 dev.off()
|
|
609
|
|
610 fontSize = 4
|
|
611
|
|
612 bak = patientResult
|
|
613 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
614 patientResult$relativeValue = patientResult$value * 10
|
|
615 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
616 plt = ggplot(patientResult)
|
|
617 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
618 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
619 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
620 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
621 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
622 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
623 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
624 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
625 print(plt)
|
|
626 dev.off()
|
|
627 }
|
|
628
|
33
|
629 if(nrow(triplets) != 0){
|
|
630 triplets$uniqueID = "ID"
|
|
631
|
|
632 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
633 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
634 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
635
|
|
636 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
637 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
638 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
639
|
|
640 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
641
|
|
642 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
643 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
644 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
645
|
|
646 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
647 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
648
|
|
649 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
650
|
|
651 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
652
|
|
653 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
654
|
|
655 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
656
|
|
657 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
658
|
|
659 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
660
|
|
661 #remove duplicate V+J+CDR3, add together numerical values
|
|
662 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
663 Cell_Count=unique(.SD$Cell_Count),
|
|
664 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
665 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
666 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
667 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
668 Frequency=sum(.SD$Frequency),
|
|
669 normalized_read_count=sum(.SD$normalized_read_count),
|
|
670 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
671 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
672
|
|
673
|
|
674 interval = intervalReads
|
|
675 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
676 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
677
|
|
678 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
679 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
680 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
681 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
682
|
|
683 one = triplets[triplets$Sample == "16278_Left",]
|
|
684 two = triplets[triplets$Sample == "26402_Left",]
|
|
685 three = triplets[triplets$Sample == "26759_Left",]
|
|
686 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
687
|
|
688 one = triplets[triplets$Sample == "16278_Right",]
|
|
689 two = triplets[triplets$Sample == "26402_Right",]
|
|
690 three = triplets[triplets$Sample == "26759_Right",]
|
|
691 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
692
|
|
693
|
|
694 interval = intervalFreq
|
|
695 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
696 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
697
|
|
698 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
699 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
700 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
701 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
702
|
|
703 one = triplets[triplets$Sample == "16278_Left",]
|
|
704 two = triplets[triplets$Sample == "26402_Left",]
|
|
705 three = triplets[triplets$Sample == "26759_Left",]
|
|
706 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
707
|
|
708 one = triplets[triplets$Sample == "16278_Right",]
|
|
709 two = triplets[triplets$Sample == "26402_Right",]
|
|
710 three = triplets[triplets$Sample == "26759_Right",]
|
|
711 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
712 } else {
|
|
713 cat("", file="triplets.txt")
|
|
714 }
|