Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 12:eb5b569b44dd draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Wed, 10 Dec 2014 09:49:16 -0500 |
| parents | bc4612998d50 |
| children | 576de7c96c4f |
comparison
equal
deleted
inserted
replaced
| 11:bc4612998d50 | 12:eb5b569b44dd |
|---|---|
| 15 library(parallel) | 15 library(parallel) |
| 16 #require(xtable) | 16 #require(xtable) |
| 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) |
| 18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) | 18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) |
| 19 dat = dat[!is.na(dat$Patient),] | 19 dat = dat[!is.na(dat$Patient),] |
| 20 dat = dat[!duplicated(dat$Clone_Sequence), ] | |
| 20 | 21 |
| 21 setwd(outDir) | 22 setwd(outDir) |
| 22 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) | 23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) |
| 23 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) | 24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) |
| 24 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) | 25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) |
| 25 | 26 |
| 26 str(dat) | 27 dat$Frequency = ((10^dat$Log10_Frequency)*100) |
| 28 | |
| 27 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T) | 29 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T) |
| 28 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | 30 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) |
| 29 | 31 |
| 30 most.common = function(x){ | 32 most.common = function(x, ret="V"){ |
| 31 ux = unique(x) | 33 past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";") |
| 34 ux = unique(past) | |
| 32 if(length(ux) > 1){ | 35 if(length(ux) > 1){ |
| 33 xtdf = data.frame(table(x)) | 36 xtdf = data.frame(table(past)) |
| 34 return(xtdf$Var1[which.max(xtdf$Freq)]) | |
| 35 #print(xtdf) | 37 #print(xtdf) |
| 36 } | 38 res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";")) |
| 37 return(unique(x)) | 39 #print(res) |
| 40 if(ret == "V"){ | |
| 41 return(res[1]) | |
| 42 } else { | |
| 43 return(res[2]) | |
| 44 } | |
| 45 } | |
| 46 | |
| 47 if(ret == "V"){ | |
| 48 return(unique(x$V_Segment_Major_Gene)) | |
| 49 } else { | |
| 50 return(unique(x$J_Segment_Major_Gene)) | |
| 51 } | |
| 38 } | 52 } |
| 39 | 53 |
| 40 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene=most.common(.SD$V_Segment_Major_Gene), J_Segment_Major_Gene=most.common(.SD$J_Segment_Major_Gene), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) | 54 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) |
| 55 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | |
| 41 | 56 |
| 42 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) | 57 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) |
| 43 dat$Frequency = ((10^dat$Log10_Frequency)*100) | 58 |
| 44 | 59 |
| 45 dat = dat[dat$Frequency >= min_freq,] | 60 dat = dat[dat$Frequency >= min_freq,] |
| 46 | 61 |
| 47 cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) | 62 #cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) |
| 48 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) | 63 #dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) |
| 64 dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes | |
| 49 dat = dat[dat$normalized_read_count >= min_cells,] | 65 dat = dat[dat$normalized_read_count >= min_cells,] |
| 50 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) | 66 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) |
| 51 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] | 67 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] |
| 52 | 68 |
| 53 patients = split(dat, dat$Patient, drop=T) | 69 patients = split(dat, dat$Patient, drop=T) |
| 98 if(appendtxt){ | 114 if(appendtxt){ |
| 99 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) | 115 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) |
| 100 } | 116 } |
| 101 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) | 117 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) |
| 102 | 118 |
| 103 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) | 119 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) |
| 104 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) | 120 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) |
| 105 | 121 patient1$merge = paste(patient1$CDR3_Sense_Sequence) |
| 122 patient2$merge = paste(patient2$CDR3_Sense_Sequence) | |
| 123 | |
| 124 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | |
| 106 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | 125 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") |
| 107 res1 = vector() | 126 res1 = vector() |
| 108 res2 = vector() | 127 res2 = vector() |
| 109 resBoth = vector() | 128 resBoth = vector() |
| 110 read1Count = vector() | 129 read1Count = vector() |
| 231 patientIndex = which(colnames(patient1) == "Patient") | 250 patientIndex = which(colnames(patient1) == "Patient") |
| 232 oneSample = paste(patient1[1,sampleIndex], sep="") | 251 oneSample = paste(patient1[1,sampleIndex], sep="") |
| 233 twoSample = paste(patient2[1,sampleIndex], sep="") | 252 twoSample = paste(patient2[1,sampleIndex], sep="") |
| 234 threeSample = paste(patient3[1,sampleIndex], sep="") | 253 threeSample = paste(patient3[1,sampleIndex], sep="") |
| 235 | 254 |
| 236 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) | 255 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) |
| 237 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) | 256 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) |
| 238 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) | 257 #patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) |
| 258 | |
| 259 patient1$merge = paste(patient1$CDR3_Sense_Sequence) | |
| 260 patient2$merge = paste(patient2$CDR3_Sense_Sequence) | |
| 261 patient3$merge = paste(patient3$CDR3_Sense_Sequence) | |
| 239 | 262 |
| 240 patientMerge = merge(patient1, patient2, by="merge") | 263 patientMerge = merge(patient1, patient2, by="merge") |
| 241 patientMerge = merge(patientMerge, patient3, by="merge") | 264 patientMerge = merge(patientMerge, patient3, by="merge") |
| 242 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") | 265 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") |
| 243 res1 = vector() | 266 res1 = vector() |
| 269 resAll = append(resAll, sum(all)) | 292 resAll = append(resAll, sum(all)) |
| 270 #threshhold = 0 | 293 #threshhold = 0 |
| 271 if(threshhold != 0){ | 294 if(threshhold != 0){ |
| 272 if(sum(one) > 0){ | 295 if(sum(one) > 0){ |
| 273 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 296 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
| 274 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") | 297 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") |
| 275 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") | 298 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 276 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 299 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 277 } | 300 } |
| 278 if(sum(two) > 0){ | 301 if(sum(two) > 0){ |
| 279 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 302 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
| 280 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") | 303 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") |
| 281 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") | 304 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 282 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 305 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 283 } | 306 } |
| 284 if(sum(three) > 0){ | 307 if(sum(three) > 0){ |
| 285 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 308 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
| 286 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") | 309 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") |
| 287 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") | 310 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 288 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 311 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 289 } | 312 } |
| 290 } | 313 } |
| 291 if(sum(all) > 0){ | 314 if(sum(all) > 0){ |
