Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 34:37d9074ef2c6 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Wed, 26 Aug 2015 05:19:55 -0400 |
| parents | 642b4593f0a4 |
| children | 32d8a5abed4c |
comparison
equal
deleted
inserted
replaced
| 33:642b4593f0a4 | 34:37d9074ef2c6 |
|---|---|
| 15 library(grid) | 15 library(grid) |
| 16 library(parallel) | 16 library(parallel) |
| 17 #require(xtable) | 17 #require(xtable) |
| 18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | 18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) |
| 19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) | 19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) |
| 20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")] | |
| 21 dat$dsPerM = 0 | |
| 20 dat = dat[!is.na(dat$Patient),] | 22 dat = dat[!is.na(dat$Patient),] |
| 21 dat$Related_to_leukemia_clone = F | 23 dat$Related_to_leukemia_clone = F |
| 22 | 24 |
| 23 setwd(outDir) | 25 setwd(outDir) |
| 24 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) | 26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) |
| 41 | 43 |
| 42 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) | 44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) |
| 43 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) | 45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) |
| 44 | 46 |
| 45 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] | 47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] |
| 46 | 48 print(paste("rows:", nrow(dat))) |
| 47 dat = merge(dat, min_cell_count, by="min_cell_paste") | 49 dat = merge(dat, min_cell_count, by="min_cell_paste") |
| 48 | 50 print(paste("rows:", nrow(dat))) |
| 49 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 | 51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 |
| 50 | 52 |
| 51 dat = dat[dat$normalized_read_count >= min_cells,] | 53 dat = dat[dat$normalized_read_count >= min_cells,] |
| 52 | 54 |
| 53 dat$paste = paste(dat$Sample, dat$Clone_Sequence) | 55 dat$paste = paste(dat$Sample, dat$Clone_Sequence) |
| 54 | 56 |
| 55 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T) | |
| 56 #remove duplicate V+J+CDR3, add together numerical values | 57 #remove duplicate V+J+CDR3, add together numerical values |
| 57 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor), | 58 if(mergeOn != "Clone_Sequence"){ |
| 58 Cell_Count=unique(.SD$Cell_Count), | 59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T) |
| 59 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), | 60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor), |
| 60 Total_Read_Count=sum(.SD$Total_Read_Count), | 61 Cell_Count=unique(.SD$Cell_Count), |
| 61 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0), | 62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), |
| 62 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone), | 63 Total_Read_Count=sum(.SD$Total_Read_Count), |
| 63 Frequency=sum(.SD$Frequency), | 64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0), |
| 64 locus_V=unique(.SD$locus_V), | 65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone), |
| 65 locus_J=unique(.SD$locus_J), | 66 Frequency=sum(.SD$Frequency), |
| 66 min_cell_count=unique(.SD$min_cell_count), | 67 locus_V=unique(.SD$locus_V), |
| 67 normalized_read_count=sum(.SD$normalized_read_count), | 68 locus_J=unique(.SD$locus_J), |
| 68 Log10_Frequency=sum(.SD$Log10_Frequency), | 69 min_cell_count=unique(.SD$min_cell_count), |
| 69 Clone_Sequence=.SD$Clone_Sequence[1], | 70 normalized_read_count=sum(.SD$normalized_read_count), |
| 70 min_cell_paste=.SD$min_cell_paste[1], | 71 Log10_Frequency=sum(.SD$Log10_Frequency), |
| 71 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | 72 Clone_Sequence=.SD$Clone_Sequence[1], |
| 72 | 73 min_cell_paste=.SD$min_cell_paste[1], |
| 74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | |
| 75 } | |
| 73 | 76 |
| 74 patients = split(dat, dat$Patient, drop=T) | 77 patients = split(dat, dat$Patient, drop=T) |
| 75 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) | 78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) |
| 76 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) | 79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) |
| 77 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") | 80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") |
| 84 | 87 |
| 85 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ | 88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ |
| 86 if (!is.data.frame(x) & is.list(x)){ | 89 if (!is.data.frame(x) & is.list(x)){ |
| 87 x = x[[1]] | 90 x = x[[1]] |
| 88 } | 91 } |
| 89 x$Sample = factor(x$Sample, levels=unique(x$Sample)) | 92 #x$Sample = factor(x$Sample, levels=unique(x$Sample)) |
| 93 x = data.frame(x,stringsAsFactors=F) | |
| 90 onShort = "reads" | 94 onShort = "reads" |
| 91 if(on == "Frequency"){ | 95 if(on == "Frequency"){ |
| 92 onShort = "freq" | 96 onShort = "freq" |
| 93 } | 97 } |
| 94 onx = paste(on, ".x", sep="") | 98 onx = paste(on, ".x", sep="") |
| 141 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] | 145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] |
| 142 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both")) | 146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both")) |
| 143 scatterplot_data$on = onShort | 147 scatterplot_data$on = onShort |
| 144 | 148 |
| 145 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | 149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") |
| 150 | |
| 151 | |
| 152 #fuzzy matching here... | |
| 153 if(mergeOn == "Clone_Sequence"){ | |
| 154 merge.list = patientMerge$merge | |
| 155 | |
| 156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),] | |
| 157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),] | |
| 158 | |
| 159 patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence) | |
| 160 patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence) | |
| 161 | |
| 162 merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge))) | |
| 163 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,] | |
| 164 | |
| 165 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] | |
| 166 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] | |
| 167 | |
| 168 while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){ | |
| 169 current.merge.1 = patient1.fuzzy[1,"merge"] | |
| 170 current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"] | |
| 171 current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,] | |
| 172 | |
| 173 #agrep/adist the two samples | |
| 174 agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 1, costs=list(insertions=0.1, deletions=0.1, substitutions=1)) | |
| 175 | |
| 176 | |
| 177 if(length(agrep.match) == 1){ | |
| 178 current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"] | |
| 179 patientMerge.new.row = data.frame(merge=current.clone.seq.1, | |
| 180 min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"], | |
| 181 Patient.x=patient1.fuzzy[1,"Patient"], | |
| 182 Receptor.x=patient1.fuzzy[1,"Receptor"], | |
| 183 Sample.x=patient1.fuzzy[1,"Sample"], | |
| 184 Cell_Count.x=patient1.fuzzy[1,"Cell_Count"], | |
| 185 Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"], | |
| 186 Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"], | |
| 187 Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"], | |
| 188 dsPerM.x=patient1.fuzzy[1,"dsPerM"], | |
| 189 J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"], | |
| 190 V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"], | |
| 191 Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"], | |
| 192 CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"], | |
| 193 Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"], | |
| 194 Frequency.x=patient1.fuzzy[1,"Frequency"], | |
| 195 locus_V.x=patient1.fuzzy[1,"locus_V"], | |
| 196 locus_J.x=patient1.fuzzy[1,"locus_J"], | |
| 197 min_cell_count.x=patient1.fuzzy[1,"min_cell_count"], | |
| 198 normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"], | |
| 199 paste.x=patient1.fuzzy[1,"paste"], | |
| 200 min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"], | |
| 201 Patient.y=patient2.fuzzy[agrep.match,"Patient"], | |
| 202 Receptor.y=patient2.fuzzy[agrep.match,"Receptor"], | |
| 203 Sample.y=patient2.fuzzy[agrep.match,"Sample"], | |
| 204 Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"], | |
| 205 Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"], | |
| 206 Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"], | |
| 207 Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"], | |
| 208 dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"], | |
| 209 J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"], | |
| 210 V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"], | |
| 211 Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"], | |
| 212 CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"], | |
| 213 Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"], | |
| 214 Frequency.y=patient2.fuzzy[agrep.match,"Frequency"], | |
| 215 locus_V.y=patient2.fuzzy[agrep.match,"locus_V"], | |
| 216 locus_J.y=patient2.fuzzy[agrep.match,"locus_J"], | |
| 217 min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"], | |
| 218 normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"], | |
| 219 paste.y=patient2.fuzzy[agrep.match,"paste"]) | |
| 220 #add to patientMerge | |
| 221 patientMerge = rbind(patientMerge, patientMerge.new.row) | |
| 222 #remove from patient*.fuzzy | |
| 223 | |
| 224 | |
| 225 #remove the fuzzy merged clone sequences from the original datasets | |
| 226 patient1 <<- patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],] | |
| 227 patient2 <<- patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],] | |
| 228 | |
| 229 scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,] | |
| 230 | |
| 231 patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),] | |
| 232 | |
| 233 | |
| 234 } else if (length(agrep.match) > 1){ | |
| 235 #multiple matches, whatdo? | |
| 236 cat(paste("<tr><td>", "Multiple matches found for ", current.merge.1, " in ", patient, "</td></tr>", sep=""), file=logfile, append=T) | |
| 237 } | |
| 238 patient1.fuzzy = patient1.fuzzy[-1,] | |
| 239 } | |
| 240 | |
| 241 #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1)) | |
| 242 } | |
| 243 | |
| 146 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) | 244 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) |
| 147 res1 = vector() | 245 res1 = vector() |
| 148 res2 = vector() | 246 res2 = vector() |
| 149 resBoth = vector() | 247 resBoth = vector() |
| 150 read1Count = vector() | 248 read1Count = vector() |
| 185 } else { | 283 } else { |
| 186 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] | 284 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] |
| 187 if(nrow(scatterplot_locus_data) > 0){ | 285 if(nrow(scatterplot_locus_data) > 0){ |
| 188 scatterplot_locus_data$Rearrangement = product[iter, titleIndex] | 286 scatterplot_locus_data$Rearrangement = product[iter, titleIndex] |
| 189 } | 287 } |
| 288 in_one = (scatterplot_locus_data$merge %in% patient1$merge) | |
| 289 in_two = (scatterplot_locus_data$merge %in% patient2$merge) | |
| 290 not_in_one = !in_one | |
| 291 if(any(in_two)){ | |
| 292 scatterplot_locus_data[not_in_one,]$type = twoSample | |
| 293 } | |
| 190 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge) | 294 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge) |
| 191 if(any(in_both)){ | 295 if(any(in_both)){ |
| 192 scatterplot_locus_data[in_both,]$type = "In Both" | 296 scatterplot_locus_data[in_both,]$type = "In Both" |
| 193 } | |
| 194 in_one = (scatterplot_locus_data$merge %in% patient1$merge) | |
| 195 not_in_one = !in_one | |
| 196 if(any(not_in_one)){ | |
| 197 scatterplot_locus_data[not_in_one,]$type = twoSample | |
| 198 } | 297 } |
| 199 if(type == "single"){ | 298 if(type == "single"){ |
| 200 single_patients <<- rbind(single_patients, scatterplot_locus_data) | 299 single_patients <<- rbind(single_patients, scatterplot_locus_data) |
| 201 } | 300 } |
| 202 p = NULL | 301 p = NULL |
| 203 if(nrow(scatterplot_locus_data) != 0){ | 302 if(nrow(scatterplot_locus_data) != 0){ |
| 204 if(on == "normalized_read_count"){ | 303 if(on == "normalized_read_count"){ |
| 205 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) | 304 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) |
| 206 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000)) | 305 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) |
| 207 } else { | 306 } else { |
| 208 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) | 307 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) |
| 209 } | 308 } |
| 210 p = p + geom_point(aes(colour=type), position="jitter") | 309 p = p + geom_point(aes(colour=type), position="jitter") |
| 211 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) | 310 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) |
