Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 12:eb5b569b44dd draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 10 Dec 2014 09:49:16 -0500 |
parents | bc4612998d50 |
children | 576de7c96c4f |
comparison
equal
deleted
inserted
replaced
11:bc4612998d50 | 12:eb5b569b44dd |
---|---|
15 library(parallel) | 15 library(parallel) |
16 #require(xtable) | 16 #require(xtable) |
17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) |
18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) | 18 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) |
19 dat = dat[!is.na(dat$Patient),] | 19 dat = dat[!is.na(dat$Patient),] |
20 dat = dat[!duplicated(dat$Clone_Sequence), ] | |
20 | 21 |
21 setwd(outDir) | 22 setwd(outDir) |
22 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) | 23 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) |
23 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) | 24 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) |
24 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) | 25 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) |
25 | 26 |
26 str(dat) | 27 dat$Frequency = ((10^dat$Log10_Frequency)*100) |
28 | |
27 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T) | 29 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T) |
28 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | 30 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) |
29 | 31 |
30 most.common = function(x){ | 32 most.common = function(x, ret="V"){ |
31 ux = unique(x) | 33 past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";") |
34 ux = unique(past) | |
32 if(length(ux) > 1){ | 35 if(length(ux) > 1){ |
33 xtdf = data.frame(table(x)) | 36 xtdf = data.frame(table(past)) |
34 return(xtdf$Var1[which.max(xtdf$Freq)]) | |
35 #print(xtdf) | 37 #print(xtdf) |
36 } | 38 res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";")) |
37 return(unique(x)) | 39 #print(res) |
40 if(ret == "V"){ | |
41 return(res[1]) | |
42 } else { | |
43 return(res[2]) | |
44 } | |
45 } | |
46 | |
47 if(ret == "V"){ | |
48 return(unique(x$V_Segment_Major_Gene)) | |
49 } else { | |
50 return(unique(x$J_Segment_Major_Gene)) | |
51 } | |
38 } | 52 } |
39 | 53 |
40 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene=most.common(.SD$V_Segment_Major_Gene), J_Segment_Major_Gene=most.common(.SD$J_Segment_Major_Gene), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) | 54 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) |
55 dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) | |
41 | 56 |
42 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) | 57 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) |
43 dat$Frequency = ((10^dat$Log10_Frequency)*100) | 58 |
44 | 59 |
45 dat = dat[dat$Frequency >= min_freq,] | 60 dat = dat[dat$Frequency >= min_freq,] |
46 | 61 |
47 cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) | 62 #cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) |
48 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) | 63 #dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) |
64 dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes | |
49 dat = dat[dat$normalized_read_count >= min_cells,] | 65 dat = dat[dat$normalized_read_count >= min_cells,] |
50 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) | 66 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) |
51 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] | 67 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] |
52 | 68 |
53 patients = split(dat, dat$Patient, drop=T) | 69 patients = split(dat, dat$Patient, drop=T) |
98 if(appendtxt){ | 114 if(appendtxt){ |
99 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) | 115 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) |
100 } | 116 } |
101 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) | 117 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) |
102 | 118 |
103 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) | 119 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) |
104 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) | 120 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) |
105 | 121 patient1$merge = paste(patient1$CDR3_Sense_Sequence) |
122 patient2$merge = paste(patient2$CDR3_Sense_Sequence) | |
123 | |
124 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | |
106 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") | 125 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") |
107 res1 = vector() | 126 res1 = vector() |
108 res2 = vector() | 127 res2 = vector() |
109 resBoth = vector() | 128 resBoth = vector() |
110 read1Count = vector() | 129 read1Count = vector() |
231 patientIndex = which(colnames(patient1) == "Patient") | 250 patientIndex = which(colnames(patient1) == "Patient") |
232 oneSample = paste(patient1[1,sampleIndex], sep="") | 251 oneSample = paste(patient1[1,sampleIndex], sep="") |
233 twoSample = paste(patient2[1,sampleIndex], sep="") | 252 twoSample = paste(patient2[1,sampleIndex], sep="") |
234 threeSample = paste(patient3[1,sampleIndex], sep="") | 253 threeSample = paste(patient3[1,sampleIndex], sep="") |
235 | 254 |
236 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) | 255 #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) |
237 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) | 256 #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) |
238 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) | 257 #patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) |
258 | |
259 patient1$merge = paste(patient1$CDR3_Sense_Sequence) | |
260 patient2$merge = paste(patient2$CDR3_Sense_Sequence) | |
261 patient3$merge = paste(patient3$CDR3_Sense_Sequence) | |
239 | 262 |
240 patientMerge = merge(patient1, patient2, by="merge") | 263 patientMerge = merge(patient1, patient2, by="merge") |
241 patientMerge = merge(patientMerge, patient3, by="merge") | 264 patientMerge = merge(patientMerge, patient3, by="merge") |
242 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") | 265 colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") |
243 res1 = vector() | 266 res1 = vector() |
269 resAll = append(resAll, sum(all)) | 292 resAll = append(resAll, sum(all)) |
270 #threshhold = 0 | 293 #threshhold = 0 |
271 if(threshhold != 0){ | 294 if(threshhold != 0){ |
272 if(sum(one) > 0){ | 295 if(sum(one) > 0){ |
273 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 296 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
274 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") | 297 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") |
275 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") | 298 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") |
276 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 299 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
277 } | 300 } |
278 if(sum(two) > 0){ | 301 if(sum(two) > 0){ |
279 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 302 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
280 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") | 303 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") |
281 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") | 304 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") |
282 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 305 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
283 } | 306 } |
284 if(sum(three) > 0){ | 307 if(sum(three) > 0){ |
285 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] | 308 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] |
286 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") | 309 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") |
287 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") | 310 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") |
288 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 311 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
289 } | 312 } |
290 } | 313 } |
291 if(sum(all) > 0){ | 314 if(sum(all) > 0){ |