Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 34:37d9074ef2c6 draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 26 Aug 2015 05:19:55 -0400 |
parents | 642b4593f0a4 |
children | 32d8a5abed4c |
comparison
equal
deleted
inserted
replaced
33:642b4593f0a4 | 34:37d9074ef2c6 |
---|---|
15 library(grid) | 15 library(grid) |
16 library(parallel) | 16 library(parallel) |
17 #require(xtable) | 17 #require(xtable) |
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | 18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) |
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) | 19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) |
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")] | |
21 dat$dsPerM = 0 | |
20 dat = dat[!is.na(dat$Patient),] | 22 dat = dat[!is.na(dat$Patient),] |
21 dat$Related_to_leukemia_clone = F | 23 dat$Related_to_leukemia_clone = F |
22 | 24 |
23 setwd(outDir) | 25 setwd(outDir) |
24 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) | 26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) |
41 | 43 |
42 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) | 44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) |
43 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) | 45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) |
44 | 46 |
45 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] | 47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] |
46 | 48 print(paste("rows:", nrow(dat))) |
47 dat = merge(dat, min_cell_count, by="min_cell_paste") | 49 dat = merge(dat, min_cell_count, by="min_cell_paste") |
48 | 50 print(paste("rows:", nrow(dat))) |
49 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 | 51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 |
50 | 52 |
51 dat = dat[dat$normalized_read_count >= min_cells,] | 53 dat = dat[dat$normalized_read_count >= min_cells,] |
52 | 54 |
53 dat$paste = paste(dat$Sample, dat$Clone_Sequence) | 55 dat$paste = paste(dat$Sample, dat$Clone_Sequence) |
54 | 56 |
55 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T) | |
56 #remove duplicate V+J+CDR3, add together numerical values | 57 #remove duplicate V+J+CDR3, add together numerical values |
57 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor), | 58 if(mergeOn != "Clone_Sequence"){ |
58 Cell_Count=unique(.SD$Cell_Count), | 59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T) |
59 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), | 60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor), |
60 Total_Read_Count=sum(.SD$Total_Read_Count), | 61 Cell_Count=unique(.SD$Cell_Count), |
61 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0), | 62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), |
62 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone), | 63 Total_Read_Count=sum(.SD$Total_Read_Count), |
63 Frequency=sum(.SD$Frequency), | 64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0), |
64 locus_V=unique(.SD$locus_V), | 65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone), |
65 locus_J=unique(.SD$locus_J), | 66 Frequency=sum(.SD$Frequency), |
66 min_cell_count=unique(.SD$min_cell_count), | 67 locus_V=unique(.SD$locus_V), |
67 normalized_read_count=sum(.SD$normalized_read_count), | 68 locus_J=unique(.SD$locus_J), |
68 Log10_Frequency=sum(.SD$Log10_Frequency), | 69 min_cell_count=unique(.SD$min_cell_count), |
69 Clone_Sequence=.SD$Clone_Sequence[1], | 70 normalized_read_count=sum(.SD$normalized_read_count), |
70 min_cell_paste=.SD$min_cell_paste[1], | 71 Log10_Frequency=sum(.SD$Log10_Frequency), |
71 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | 72 Clone_Sequence=.SD$Clone_Sequence[1], |
72 | 73 min_cell_paste=.SD$min_cell_paste[1], |
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | |
75 } | |
73 | 76 |
74 patients = split(dat, dat$Patient, drop=T) | 77 patients = split(dat, dat$Patient, drop=T) |
75 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) | 78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) |
76 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) | 79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) |
77 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") | 80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") |
84 | 87 |
85 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ | 88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ |
86 if (!is.data.frame(x) & is.list(x)){ | 89 if (!is.data.frame(x) & is.list(x)){ |
87 x = x[[1]] | 90 x = x[[1]] |
88 } | 91 } |
89 x$Sample = factor(x$Sample, levels=unique(x$Sample)) | 92 #x$Sample = factor(x$Sample, levels=unique(x$Sample)) |
93 x = data.frame(x,stringsAsFactors=F) | |
90 onShort = "reads" | 94 onShort = "reads" |
91 if(on == "Frequency"){ | 95 if(on == "Frequency"){ |
92 onShort = "freq" | 96 onShort = "freq" |
93 } | 97 } |
94 onx = paste(on, ".x", sep="") | 98 onx = paste(on, ".x", sep="") |
141 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] | 145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] |
142 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both")) | 146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both")) |
143 scatterplot_data$on = onShort | 147 scatterplot_data$on = onShort |
144 | 148 |
145 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | 149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") |
150 | |
151 | |
152 #fuzzy matching here... | |
153 if(mergeOn == "Clone_Sequence"){ | |
154 merge.list = patientMerge$merge | |
155 | |
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),] | |
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),] | |
158 | |
159 patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence) | |
160 patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence) | |
161 | |
162 merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge))) | |
163 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,] | |
164 | |
165 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] | |
166 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] | |
167 | |
168 while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){ | |
169 current.merge.1 = patient1.fuzzy[1,"merge"] | |
170 current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"] | |
171 current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,] | |
172 | |
173 #agrep/adist the two samples | |
174 agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 1, costs=list(insertions=0.1, deletions=0.1, substitutions=1)) | |
175 | |
176 | |
177 if(length(agrep.match) == 1){ | |
178 current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"] | |
179 patientMerge.new.row = data.frame(merge=current.clone.seq.1, | |
180 min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"], | |
181 Patient.x=patient1.fuzzy[1,"Patient"], | |
182 Receptor.x=patient1.fuzzy[1,"Receptor"], | |
183 Sample.x=patient1.fuzzy[1,"Sample"], | |
184 Cell_Count.x=patient1.fuzzy[1,"Cell_Count"], | |
185 Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"], | |
186 Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"], | |
187 Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"], | |
188 dsPerM.x=patient1.fuzzy[1,"dsPerM"], | |
189 J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"], | |
190 V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"], | |
191 Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"], | |
192 CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"], | |
193 Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"], | |
194 Frequency.x=patient1.fuzzy[1,"Frequency"], | |
195 locus_V.x=patient1.fuzzy[1,"locus_V"], | |
196 locus_J.x=patient1.fuzzy[1,"locus_J"], | |
197 min_cell_count.x=patient1.fuzzy[1,"min_cell_count"], | |
198 normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"], | |
199 paste.x=patient1.fuzzy[1,"paste"], | |
200 min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"], | |
201 Patient.y=patient2.fuzzy[agrep.match,"Patient"], | |
202 Receptor.y=patient2.fuzzy[agrep.match,"Receptor"], | |
203 Sample.y=patient2.fuzzy[agrep.match,"Sample"], | |
204 Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"], | |
205 Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"], | |
206 Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"], | |
207 Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"], | |
208 dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"], | |
209 J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"], | |
210 V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"], | |
211 Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"], | |
212 CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"], | |
213 Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"], | |
214 Frequency.y=patient2.fuzzy[agrep.match,"Frequency"], | |
215 locus_V.y=patient2.fuzzy[agrep.match,"locus_V"], | |
216 locus_J.y=patient2.fuzzy[agrep.match,"locus_J"], | |
217 min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"], | |
218 normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"], | |
219 paste.y=patient2.fuzzy[agrep.match,"paste"]) | |
220 #add to patientMerge | |
221 patientMerge = rbind(patientMerge, patientMerge.new.row) | |
222 #remove from patient*.fuzzy | |
223 | |
224 | |
225 #remove the fuzzy merged clone sequences from the original datasets | |
226 patient1 <<- patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],] | |
227 patient2 <<- patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],] | |
228 | |
229 scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,] | |
230 | |
231 patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),] | |
232 | |
233 | |
234 } else if (length(agrep.match) > 1){ | |
235 #multiple matches, whatdo? | |
236 cat(paste("<tr><td>", "Multiple matches found for ", current.merge.1, " in ", patient, "</td></tr>", sep=""), file=logfile, append=T) | |
237 } | |
238 patient1.fuzzy = patient1.fuzzy[-1,] | |
239 } | |
240 | |
241 #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1)) | |
242 } | |
243 | |
146 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) | 244 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) |
147 res1 = vector() | 245 res1 = vector() |
148 res2 = vector() | 246 res2 = vector() |
149 resBoth = vector() | 247 resBoth = vector() |
150 read1Count = vector() | 248 read1Count = vector() |
185 } else { | 283 } else { |
186 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] | 284 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] |
187 if(nrow(scatterplot_locus_data) > 0){ | 285 if(nrow(scatterplot_locus_data) > 0){ |
188 scatterplot_locus_data$Rearrangement = product[iter, titleIndex] | 286 scatterplot_locus_data$Rearrangement = product[iter, titleIndex] |
189 } | 287 } |
288 in_one = (scatterplot_locus_data$merge %in% patient1$merge) | |
289 in_two = (scatterplot_locus_data$merge %in% patient2$merge) | |
290 not_in_one = !in_one | |
291 if(any(in_two)){ | |
292 scatterplot_locus_data[not_in_one,]$type = twoSample | |
293 } | |
190 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge) | 294 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge) |
191 if(any(in_both)){ | 295 if(any(in_both)){ |
192 scatterplot_locus_data[in_both,]$type = "In Both" | 296 scatterplot_locus_data[in_both,]$type = "In Both" |
193 } | |
194 in_one = (scatterplot_locus_data$merge %in% patient1$merge) | |
195 not_in_one = !in_one | |
196 if(any(not_in_one)){ | |
197 scatterplot_locus_data[not_in_one,]$type = twoSample | |
198 } | 297 } |
199 if(type == "single"){ | 298 if(type == "single"){ |
200 single_patients <<- rbind(single_patients, scatterplot_locus_data) | 299 single_patients <<- rbind(single_patients, scatterplot_locus_data) |
201 } | 300 } |
202 p = NULL | 301 p = NULL |
203 if(nrow(scatterplot_locus_data) != 0){ | 302 if(nrow(scatterplot_locus_data) != 0){ |
204 if(on == "normalized_read_count"){ | 303 if(on == "normalized_read_count"){ |
205 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) | 304 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) |
206 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000)) | 305 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) |
207 } else { | 306 } else { |
208 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) | 307 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) |
209 } | 308 } |
210 p = p + geom_point(aes(colour=type), position="jitter") | 309 p = p + geom_point(aes(colour=type), position="jitter") |
211 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) | 310 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) |