comparison RScript.r @ 3:f9316f7676cc draft

Uploaded
author davidvanzessen
date Tue, 26 Aug 2014 09:53:22 -0400
parents 8d562506f4f9
children f11df36f43bb
comparison
equal deleted inserted replaced
2:8d562506f4f9 3:f9316f7676cc
1 args <- commandArgs(trailingOnly = TRUE) 1 args <- commandArgs(trailingOnly = TRUE)
2 2
3 inFile = args[1] 3 inFile = args[1]
4 outDir = args[2] 4 outDir = args[2]
5 logfile = args[3]
6 min_freq = as.numeric(args[4])
7 min_cells = as.numeric(args[5])
8
9 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
5 10
6 require(ggplot2) 11 require(ggplot2)
7 require(reshape2) 12 require(reshape2)
8 require(data.table) 13 require(data.table)
9 require(grid) 14 require(grid)
10 #require(xtable) 15 #require(xtable)
16 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
11 dat = read.csv(inFile, sep="\t") 17 dat = read.csv(inFile, sep="\t")
12 #dat = data.frame(fread(inFile)) #faster but with a dep 18 #dat = data.frame(fread(inFile)) #faster but with a dep
13 setwd(outDir) 19 setwd(outDir)
20 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
14 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) 21 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
15 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) 22 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
16 23
24 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
17 dat$Frequency = ((10^dat$Log10_Frequency)*100) 25 dat$Frequency = ((10^dat$Log10_Frequency)*100)
18 26
27 dat = dat[dat$Frequency >= min_freq,]
28
29 cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T)
19 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) 30 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2)
31 dat = dat[dat$normalized_read_count >= min_cells,]
20 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) 32 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence)
33 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T)
21 dat = dat[!duplicated(dat$paste),] 34 dat = dat[!duplicated(dat$paste),]
22 patients = split(dat, dat$Patient, drop=T) 35 patients = split(dat, dat$Patient, drop=T)
23 intervalReads = rev(c(0,2,10,100,1000,10000)) 36 intervalReads = rev(c(0,2,10,100,1000,10000))
24 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5)) 37 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5))
25 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") 38 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
34 if(on == "Frequency"){ 47 if(on == "Frequency"){
35 onShort = "freq" 48 onShort = "freq"
36 } 49 }
37 splt = split(x, x$Sample, drop=T) 50 splt = split(x, x$Sample, drop=T)
38 if(length(splt) == 1){ 51 if(length(splt) == 1){
39 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample, skipping")) 52 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
40 return() 53 splt[[2]] = data.frame("Patient" = 'NA', "Receptor" = 'NA', "Sample" = 'NA', "Cell_Count" = 100, "Clone_Molecule_Count_From_Spikes" = 10, "Log10_Frequency" = 1, "Total_Read_Count" = 100, "dsMol_per_1e6_cells" = 100, "J_Segment_Major_Gene" = 'NA', "V_Segment_Major_Gene" = 'NA', "Clone_Sequence" = 'NA', "CDR3_Sense_Sequence" = 'NA', "Related_to_leukemia_clone" = FALSE, "Frequency"= 0, "normalized_read_count" = 0, "paste" = 'a')
41 } 54 }
42 patient1 = splt[[1]] 55 patient1 = splt[[1]]
43 patient2 = splt[[2]] 56 patient2 = splt[[2]]
44 57
45 threshholdIndex = which(colnames(product) == "interval") 58 threshholdIndex = which(colnames(product) == "interval")
49 sampleIndex = which(colnames(x) == "Sample") 62 sampleIndex = which(colnames(x) == "Sample")
50 patientIndex = which(colnames(x) == "Patient") 63 patientIndex = which(colnames(x) == "Patient")
51 oneSample = paste(patient1[1,sampleIndex], sep="") 64 oneSample = paste(patient1[1,sampleIndex], sep="")
52 twoSample = paste(patient2[1,sampleIndex], sep="") 65 twoSample = paste(patient2[1,sampleIndex], sep="")
53 patient = paste(x[1,patientIndex]) 66 patient = paste(x[1,patientIndex])
54 67
55 #print(c(length(grep(".*_Right$", twoSample)) == 1, "Right"))
56 #print(c(length(grep(".*_Dx_BM$", twoSample)) == 1, "Dx_BM"))
57 #print(c(length(grep(".*_Dx$", twoSample)) == 1, "Dx"))
58 switched = F 68 switched = F
59 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){ 69 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
60 tmp = twoSample 70 tmp = twoSample
61 twoSample = oneSample 71 twoSample = oneSample
62 oneSample = tmp 72 oneSample = tmp
66 switched = T 76 switched = T
67 } 77 }
68 if(appendtxt){ 78 if(appendtxt){
69 cat(paste(patient, oneSample, twoSample, sep="\t"), file="patients.txt", append=T, sep="", fill=3) 79 cat(paste(patient, oneSample, twoSample, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
70 } 80 }
81 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
71 patientMerge = merge(patient1, patient2, by="Clone_Sequence") 82 patientMerge = merge(patient1, patient2, by="Clone_Sequence")
72 res1 = vector() 83 res1 = vector()
73 res2 = vector() 84 res2 = vector()
74 resBoth = vector() 85 resBoth = vector()
75 read1Count = vector() 86 read1Count = vector()
162 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080) 173 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
163 print(plt) 174 print(plt)
164 dev.off() 175 dev.off()
165 } 176 }
166 177
178 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
179
167 interval = intervalFreq 180 interval = intervalFreq
168 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) 181 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
169 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) 182 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6))
170 #patientFrequencyCount(patient1) 183 #patientFrequencyCount(patient1)
171 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount) 184 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount)
172 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 185 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
173 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 186 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
174 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 187 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
175 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) 188 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
176 189
190 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
191
177 interval = intervalReads 192 interval = intervalReads
178 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) 193 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
179 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) 194 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6))
180 #patientResult = patientReadCount(patient1) 195 #patientResult = patientReadCount(patient1)
181 #lapply(patients[c(5,6,10)], FUN=patientReadCount) 196 #lapply(patients[c(5,6,10)], FUN=patientReadCount)
182 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") 197 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
183 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") 198 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
184 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") 199 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
185 200
201 cat("</table></html>", file=logfile, append=T)
202