Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 4:f11df36f43bb draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 15 Sep 2014 05:37:16 -0400 |
parents | f9316f7676cc |
children | 9641f3dfc590 |
comparison
equal
deleted
inserted
replaced
3:f9316f7676cc | 4:f11df36f43bb |
---|---|
6 min_freq = as.numeric(args[4]) | 6 min_freq = as.numeric(args[4]) |
7 min_cells = as.numeric(args[5]) | 7 min_cells = as.numeric(args[5]) |
8 | 8 |
9 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F) | 9 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F) |
10 | 10 |
11 require(ggplot2) | 11 library(ggplot2) |
12 require(reshape2) | 12 library(reshape2) |
13 require(data.table) | 13 library(data.table) |
14 require(grid) | 14 library(grid) |
15 library(parallel) | |
15 #require(xtable) | 16 #require(xtable) |
16 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) |
17 dat = read.csv(inFile, sep="\t") | 18 dat = read.csv(inFile, sep="\t") |
18 #dat = data.frame(fread(inFile)) #faster but with a dep | 19 #dat = data.frame(fread(inFile)) #faster but with a dep |
19 setwd(outDir) | 20 setwd(outDir) |
31 dat = dat[dat$normalized_read_count >= min_cells,] | 32 dat = dat[dat$normalized_read_count >= min_cells,] |
32 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) | 33 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) |
33 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T) | 34 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T) |
34 dat = dat[!duplicated(dat$paste),] | 35 dat = dat[!duplicated(dat$paste),] |
35 patients = split(dat, dat$Patient, drop=T) | 36 patients = split(dat, dat$Patient, drop=T) |
36 intervalReads = rev(c(0,2,10,100,1000,10000)) | 37 rm(dat) |
38 patients = patients[1:5] | |
39 intervalReads = rev(c(0,10,25,50,100,1000,10000)) | |
37 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5)) | 40 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5)) |
38 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") | 41 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") |
39 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*") | 42 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*") |
40 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb") | 43 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb") |
41 Titles = factor(Titles, levels=Titles) | 44 Titles = factor(Titles, levels=Titles) |
46 onShort = "reads" | 49 onShort = "reads" |
47 if(on == "Frequency"){ | 50 if(on == "Frequency"){ |
48 onShort = "freq" | 51 onShort = "freq" |
49 } | 52 } |
50 splt = split(x, x$Sample, drop=T) | 53 splt = split(x, x$Sample, drop=T) |
54 type="pair" | |
51 if(length(splt) == 1){ | 55 if(length(splt) == 1){ |
52 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample")) | 56 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample")) |
53 splt[[2]] = data.frame("Patient" = 'NA', "Receptor" = 'NA', "Sample" = 'NA', "Cell_Count" = 100, "Clone_Molecule_Count_From_Spikes" = 10, "Log10_Frequency" = 1, "Total_Read_Count" = 100, "dsMol_per_1e6_cells" = 100, "J_Segment_Major_Gene" = 'NA', "V_Segment_Major_Gene" = 'NA', "Clone_Sequence" = 'NA', "CDR3_Sense_Sequence" = 'NA', "Related_to_leukemia_clone" = FALSE, "Frequency"= 0, "normalized_read_count" = 0, "paste" = 'a') | 57 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0)) |
58 type="single" | |
54 } | 59 } |
55 patient1 = splt[[1]] | 60 patient1 = splt[[1]] |
56 patient2 = splt[[2]] | 61 patient2 = splt[[2]] |
57 | 62 |
58 threshholdIndex = which(colnames(product) == "interval") | 63 threshholdIndex = which(colnames(product) == "interval") |
74 patient1 = patient2 | 79 patient1 = patient2 |
75 patient2 = tmp | 80 patient2 = tmp |
76 switched = T | 81 switched = T |
77 } | 82 } |
78 if(appendtxt){ | 83 if(appendtxt){ |
79 cat(paste(patient, oneSample, twoSample, sep="\t"), file="patients.txt", append=T, sep="", fill=3) | 84 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) |
80 } | 85 } |
81 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) | 86 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) |
82 patientMerge = merge(patient1, patient2, by="Clone_Sequence") | 87 patientMerge = merge(patient1, patient2, by="Clone_Sequence") |
83 res1 = vector() | 88 res1 = vector() |
84 res2 = vector() | 89 res2 = vector() |
105 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count)) | 110 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count)) |
106 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count)) | 111 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count)) |
107 #threshhold = 0 | 112 #threshhold = 0 |
108 if(threshhold != 0){ | 113 if(threshhold != 0){ |
109 if(sum(one) > 0){ | 114 if(sum(one) > 0){ |
110 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence")] | 115 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] |
111 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence") | 116 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
112 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | 117 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="") |
113 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 118 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
114 } | 119 } |
115 if(sum(two) > 0){ | 120 if(sum(two) > 0){ |
116 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence")] | 121 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] |
117 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence") | 122 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
118 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | 123 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") |
119 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 124 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
120 } | 125 } |
121 } | 126 } |
122 if(sum(both) > 0){ | 127 if(sum(both) > 0){ |
123 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Clone_Sequence", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y")] | 128 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")] |
124 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), "Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample)) | 129 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample)) |
125 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | 130 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") |
126 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 131 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
127 } | 132 } |
128 } | 133 } |
129 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2) | 134 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2) |
177 | 182 |
178 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) | 183 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) |
179 | 184 |
180 interval = intervalFreq | 185 interval = intervalFreq |
181 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | 186 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) |
182 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) | 187 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) |
183 #patientFrequencyCount(patient1) | 188 #patientFrequencyCount(patient1) |
184 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount) | 189 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount) |
185 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 190 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
186 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 191 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
187 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 192 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
188 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 193 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
189 | 194 |
190 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) | 195 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) |
191 | 196 |
192 interval = intervalReads | 197 interval = intervalReads |
193 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | 198 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) |
194 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) | 199 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) |
195 #patientResult = patientReadCount(patient1) | 200 #patientResult = patientReadCount(patient1) |
196 #lapply(patients[c(5,6,10)], FUN=patientReadCount) | 201 #lapply(patients[c(5,6,10)], FUN=patientReadCount) |
197 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 202 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
198 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 203 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
199 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 204 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
200 | 205 |
201 cat("</table></html>", file=logfile, append=T) | 206 cat("</table></html>", file=logfile, append=T) |
202 | 207 |