comparison RScript.r @ 4:f11df36f43bb draft

Uploaded
author davidvanzessen
date Mon, 15 Sep 2014 05:37:16 -0400
parents f9316f7676cc
children 9641f3dfc590
comparison
equal deleted inserted replaced
3:f9316f7676cc 4:f11df36f43bb
6 min_freq = as.numeric(args[4]) 6 min_freq = as.numeric(args[4])
7 min_cells = as.numeric(args[5]) 7 min_cells = as.numeric(args[5])
8 8
9 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F) 9 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
10 10
11 require(ggplot2) 11 library(ggplot2)
12 require(reshape2) 12 library(reshape2)
13 require(data.table) 13 library(data.table)
14 require(grid) 14 library(grid)
15 library(parallel)
15 #require(xtable) 16 #require(xtable)
16 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
17 dat = read.csv(inFile, sep="\t") 18 dat = read.csv(inFile, sep="\t")
18 #dat = data.frame(fread(inFile)) #faster but with a dep 19 #dat = data.frame(fread(inFile)) #faster but with a dep
19 setwd(outDir) 20 setwd(outDir)
31 dat = dat[dat$normalized_read_count >= min_cells,] 32 dat = dat[dat$normalized_read_count >= min_cells,]
32 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) 33 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence)
33 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T) 34 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T)
34 dat = dat[!duplicated(dat$paste),] 35 dat = dat[!duplicated(dat$paste),]
35 patients = split(dat, dat$Patient, drop=T) 36 patients = split(dat, dat$Patient, drop=T)
36 intervalReads = rev(c(0,2,10,100,1000,10000)) 37 rm(dat)
38 patients = patients[1:5]
39 intervalReads = rev(c(0,10,25,50,100,1000,10000))
37 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5)) 40 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5))
38 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") 41 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
39 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*") 42 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
40 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb") 43 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
41 Titles = factor(Titles, levels=Titles) 44 Titles = factor(Titles, levels=Titles)
46 onShort = "reads" 49 onShort = "reads"
47 if(on == "Frequency"){ 50 if(on == "Frequency"){
48 onShort = "freq" 51 onShort = "freq"
49 } 52 }
50 splt = split(x, x$Sample, drop=T) 53 splt = split(x, x$Sample, drop=T)
54 type="pair"
51 if(length(splt) == 1){ 55 if(length(splt) == 1){
52 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample")) 56 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
53 splt[[2]] = data.frame("Patient" = 'NA', "Receptor" = 'NA', "Sample" = 'NA', "Cell_Count" = 100, "Clone_Molecule_Count_From_Spikes" = 10, "Log10_Frequency" = 1, "Total_Read_Count" = 100, "dsMol_per_1e6_cells" = 100, "J_Segment_Major_Gene" = 'NA', "V_Segment_Major_Gene" = 'NA', "Clone_Sequence" = 'NA', "CDR3_Sense_Sequence" = 'NA', "Related_to_leukemia_clone" = FALSE, "Frequency"= 0, "normalized_read_count" = 0, "paste" = 'a') 57 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
58 type="single"
54 } 59 }
55 patient1 = splt[[1]] 60 patient1 = splt[[1]]
56 patient2 = splt[[2]] 61 patient2 = splt[[2]]
57 62
58 threshholdIndex = which(colnames(product) == "interval") 63 threshholdIndex = which(colnames(product) == "interval")
74 patient1 = patient2 79 patient1 = patient2
75 patient2 = tmp 80 patient2 = tmp
76 switched = T 81 switched = T
77 } 82 }
78 if(appendtxt){ 83 if(appendtxt){
79 cat(paste(patient, oneSample, twoSample, sep="\t"), file="patients.txt", append=T, sep="", fill=3) 84 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
80 } 85 }
81 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) 86 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
82 patientMerge = merge(patient1, patient2, by="Clone_Sequence") 87 patientMerge = merge(patient1, patient2, by="Clone_Sequence")
83 res1 = vector() 88 res1 = vector()
84 res2 = vector() 89 res2 = vector()
105 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count)) 110 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
106 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count)) 111 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
107 #threshhold = 0 112 #threshhold = 0
108 if(threshhold != 0){ 113 if(threshhold != 0){
109 if(sum(one) > 0){ 114 if(sum(one) > 0){
110 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence")] 115 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
111 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence") 116 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
112 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="") 117 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
113 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 118 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
114 } 119 }
115 if(sum(two) > 0){ 120 if(sum(two) > 0){
116 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence")] 121 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
117 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence") 122 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
118 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") 123 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
119 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 124 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
120 } 125 }
121 } 126 }
122 if(sum(both) > 0){ 127 if(sum(both) > 0){
123 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Clone_Sequence", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y")] 128 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
124 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), "Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample)) 129 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
125 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") 130 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
126 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) 131 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
127 } 132 }
128 } 133 }
129 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2) 134 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
177 182
178 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) 183 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
179 184
180 interval = intervalFreq 185 interval = intervalFreq
181 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) 186 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
182 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) 187 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
183 #patientFrequencyCount(patient1) 188 #patientFrequencyCount(patient1)
184 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount) 189 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount)
185 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 190 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
186 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 191 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
187 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 192 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
188 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 193 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
189 194
190 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) 195 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
191 196
192 interval = intervalReads 197 interval = intervalReads
193 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) 198 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
194 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) 199 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
195 #patientResult = patientReadCount(patient1) 200 #patientResult = patientReadCount(patient1)
196 #lapply(patients[c(5,6,10)], FUN=patientReadCount) 201 #lapply(patients[c(5,6,10)], FUN=patientReadCount)
197 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") 202 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
198 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") 203 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
199 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") 204 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
200 205
201 cat("</table></html>", file=logfile, append=T) 206 cat("</table></html>", file=logfile, append=T)
202 207