comparison RScript.r @ 34:37d9074ef2c6 draft

Uploaded
author davidvanzessen
date Wed, 26 Aug 2015 05:19:55 -0400
parents 642b4593f0a4
children 32d8a5abed4c
comparison
equal deleted inserted replaced
33:642b4593f0a4 34:37d9074ef2c6
15 library(grid) 15 library(grid)
16 library(parallel) 16 library(parallel)
17 #require(xtable) 17 #require(xtable)
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) 18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) 19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
21 dat$dsPerM = 0
20 dat = dat[!is.na(dat$Patient),] 22 dat = dat[!is.na(dat$Patient),]
21 dat$Related_to_leukemia_clone = F 23 dat$Related_to_leukemia_clone = F
22 24
23 setwd(outDir) 25 setwd(outDir)
24 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) 26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
41 43
42 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) 44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
43 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) 45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
44 46
45 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] 47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
46 48 print(paste("rows:", nrow(dat)))
47 dat = merge(dat, min_cell_count, by="min_cell_paste") 49 dat = merge(dat, min_cell_count, by="min_cell_paste")
48 50 print(paste("rows:", nrow(dat)))
49 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
50 52
51 dat = dat[dat$normalized_read_count >= min_cells,] 53 dat = dat[dat$normalized_read_count >= min_cells,]
52 54
53 dat$paste = paste(dat$Sample, dat$Clone_Sequence) 55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
54 56
55 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
56 #remove duplicate V+J+CDR3, add together numerical values 57 #remove duplicate V+J+CDR3, add together numerical values
57 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor), 58 if(mergeOn != "Clone_Sequence"){
58 Cell_Count=unique(.SD$Cell_Count), 59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
59 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), 60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
60 Total_Read_Count=sum(.SD$Total_Read_Count), 61 Cell_Count=unique(.SD$Cell_Count),
61 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0), 62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
62 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone), 63 Total_Read_Count=sum(.SD$Total_Read_Count),
63 Frequency=sum(.SD$Frequency), 64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
64 locus_V=unique(.SD$locus_V), 65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
65 locus_J=unique(.SD$locus_J), 66 Frequency=sum(.SD$Frequency),
66 min_cell_count=unique(.SD$min_cell_count), 67 locus_V=unique(.SD$locus_V),
67 normalized_read_count=sum(.SD$normalized_read_count), 68 locus_J=unique(.SD$locus_J),
68 Log10_Frequency=sum(.SD$Log10_Frequency), 69 min_cell_count=unique(.SD$min_cell_count),
69 Clone_Sequence=.SD$Clone_Sequence[1], 70 normalized_read_count=sum(.SD$normalized_read_count),
70 min_cell_paste=.SD$min_cell_paste[1], 71 Log10_Frequency=sum(.SD$Log10_Frequency),
71 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")]) 72 Clone_Sequence=.SD$Clone_Sequence[1],
72 73 min_cell_paste=.SD$min_cell_paste[1],
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
75 }
73 76
74 patients = split(dat, dat$Patient, drop=T) 77 patients = split(dat, dat$Patient, drop=T)
75 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) 78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
76 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) 79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
77 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") 80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
84 87
85 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ 88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
86 if (!is.data.frame(x) & is.list(x)){ 89 if (!is.data.frame(x) & is.list(x)){
87 x = x[[1]] 90 x = x[[1]]
88 } 91 }
89 x$Sample = factor(x$Sample, levels=unique(x$Sample)) 92 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
93 x = data.frame(x,stringsAsFactors=F)
90 onShort = "reads" 94 onShort = "reads"
91 if(on == "Frequency"){ 95 if(on == "Frequency"){
92 onShort = "freq" 96 onShort = "freq"
93 } 97 }
94 onx = paste(on, ".x", sep="") 98 onx = paste(on, ".x", sep="")
141 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] 145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
142 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both")) 146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
143 scatterplot_data$on = onShort 147 scatterplot_data$on = onShort
144 148
145 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") 149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
150
151
152 #fuzzy matching here...
153 if(mergeOn == "Clone_Sequence"){
154 merge.list = patientMerge$merge
155
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
158
159 patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
160 patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
161
162 merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge)))
163 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
164
165 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
166 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
167
168 while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){
169 current.merge.1 = patient1.fuzzy[1,"merge"]
170 current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"]
171 current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,]
172
173 #agrep/adist the two samples
174 agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 1, costs=list(insertions=0.1, deletions=0.1, substitutions=1))
175
176
177 if(length(agrep.match) == 1){
178 current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"]
179 patientMerge.new.row = data.frame(merge=current.clone.seq.1,
180 min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"],
181 Patient.x=patient1.fuzzy[1,"Patient"],
182 Receptor.x=patient1.fuzzy[1,"Receptor"],
183 Sample.x=patient1.fuzzy[1,"Sample"],
184 Cell_Count.x=patient1.fuzzy[1,"Cell_Count"],
185 Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"],
186 Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"],
187 Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"],
188 dsPerM.x=patient1.fuzzy[1,"dsPerM"],
189 J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"],
190 V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"],
191 Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"],
192 CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"],
193 Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"],
194 Frequency.x=patient1.fuzzy[1,"Frequency"],
195 locus_V.x=patient1.fuzzy[1,"locus_V"],
196 locus_J.x=patient1.fuzzy[1,"locus_J"],
197 min_cell_count.x=patient1.fuzzy[1,"min_cell_count"],
198 normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"],
199 paste.x=patient1.fuzzy[1,"paste"],
200 min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"],
201 Patient.y=patient2.fuzzy[agrep.match,"Patient"],
202 Receptor.y=patient2.fuzzy[agrep.match,"Receptor"],
203 Sample.y=patient2.fuzzy[agrep.match,"Sample"],
204 Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"],
205 Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"],
206 Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"],
207 Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"],
208 dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"],
209 J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"],
210 V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"],
211 Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"],
212 CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"],
213 Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"],
214 Frequency.y=patient2.fuzzy[agrep.match,"Frequency"],
215 locus_V.y=patient2.fuzzy[agrep.match,"locus_V"],
216 locus_J.y=patient2.fuzzy[agrep.match,"locus_J"],
217 min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"],
218 normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"],
219 paste.y=patient2.fuzzy[agrep.match,"paste"])
220 #add to patientMerge
221 patientMerge = rbind(patientMerge, patientMerge.new.row)
222 #remove from patient*.fuzzy
223
224
225 #remove the fuzzy merged clone sequences from the original datasets
226 patient1 <<- patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],]
227 patient2 <<- patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],]
228
229 scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,]
230
231 patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),]
232
233
234 } else if (length(agrep.match) > 1){
235 #multiple matches, whatdo?
236 cat(paste("<tr><td>", "Multiple matches found for ", current.merge.1, " in ", patient, "</td></tr>", sep=""), file=logfile, append=T)
237 }
238 patient1.fuzzy = patient1.fuzzy[-1,]
239 }
240
241 #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1))
242 }
243
146 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) 244 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
147 res1 = vector() 245 res1 = vector()
148 res2 = vector() 246 res2 = vector()
149 resBoth = vector() 247 resBoth = vector()
150 read1Count = vector() 248 read1Count = vector()
185 } else { 283 } else {
186 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] 284 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
187 if(nrow(scatterplot_locus_data) > 0){ 285 if(nrow(scatterplot_locus_data) > 0){
188 scatterplot_locus_data$Rearrangement = product[iter, titleIndex] 286 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
189 } 287 }
288 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
289 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
290 not_in_one = !in_one
291 if(any(in_two)){
292 scatterplot_locus_data[not_in_one,]$type = twoSample
293 }
190 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge) 294 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
191 if(any(in_both)){ 295 if(any(in_both)){
192 scatterplot_locus_data[in_both,]$type = "In Both" 296 scatterplot_locus_data[in_both,]$type = "In Both"
193 }
194 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
195 not_in_one = !in_one
196 if(any(not_in_one)){
197 scatterplot_locus_data[not_in_one,]$type = twoSample
198 } 297 }
199 if(type == "single"){ 298 if(type == "single"){
200 single_patients <<- rbind(single_patients, scatterplot_locus_data) 299 single_patients <<- rbind(single_patients, scatterplot_locus_data)
201 } 300 }
202 p = NULL 301 p = NULL
203 if(nrow(scatterplot_locus_data) != 0){ 302 if(nrow(scatterplot_locus_data) != 0){
204 if(on == "normalized_read_count"){ 303 if(on == "normalized_read_count"){
205 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) 304 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
206 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000)) 305 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
207 } else { 306 } else {
208 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) 307 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
209 } 308 }
210 p = p + geom_point(aes(colour=type), position="jitter") 309 p = p + geom_point(aes(colour=type), position="jitter")
211 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) 310 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))