Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 4:f11df36f43bb draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 15 Sep 2014 05:37:16 -0400 |
| parents | f9316f7676cc |
| children | 9641f3dfc590 |
comparison
equal
deleted
inserted
replaced
| 3:f9316f7676cc | 4:f11df36f43bb |
|---|---|
| 6 min_freq = as.numeric(args[4]) | 6 min_freq = as.numeric(args[4]) |
| 7 min_cells = as.numeric(args[5]) | 7 min_cells = as.numeric(args[5]) |
| 8 | 8 |
| 9 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F) | 9 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F) |
| 10 | 10 |
| 11 require(ggplot2) | 11 library(ggplot2) |
| 12 require(reshape2) | 12 library(reshape2) |
| 13 require(data.table) | 13 library(data.table) |
| 14 require(grid) | 14 library(grid) |
| 15 library(parallel) | |
| 15 #require(xtable) | 16 #require(xtable) |
| 16 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | 17 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) |
| 17 dat = read.csv(inFile, sep="\t") | 18 dat = read.csv(inFile, sep="\t") |
| 18 #dat = data.frame(fread(inFile)) #faster but with a dep | 19 #dat = data.frame(fread(inFile)) #faster but with a dep |
| 19 setwd(outDir) | 20 setwd(outDir) |
| 31 dat = dat[dat$normalized_read_count >= min_cells,] | 32 dat = dat[dat$normalized_read_count >= min_cells,] |
| 32 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) | 33 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) |
| 33 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T) | 34 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T) |
| 34 dat = dat[!duplicated(dat$paste),] | 35 dat = dat[!duplicated(dat$paste),] |
| 35 patients = split(dat, dat$Patient, drop=T) | 36 patients = split(dat, dat$Patient, drop=T) |
| 36 intervalReads = rev(c(0,2,10,100,1000,10000)) | 37 rm(dat) |
| 38 patients = patients[1:5] | |
| 39 intervalReads = rev(c(0,10,25,50,100,1000,10000)) | |
| 37 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5)) | 40 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5)) |
| 38 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") | 41 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") |
| 39 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*") | 42 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*") |
| 40 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb") | 43 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb") |
| 41 Titles = factor(Titles, levels=Titles) | 44 Titles = factor(Titles, levels=Titles) |
| 46 onShort = "reads" | 49 onShort = "reads" |
| 47 if(on == "Frequency"){ | 50 if(on == "Frequency"){ |
| 48 onShort = "freq" | 51 onShort = "freq" |
| 49 } | 52 } |
| 50 splt = split(x, x$Sample, drop=T) | 53 splt = split(x, x$Sample, drop=T) |
| 54 type="pair" | |
| 51 if(length(splt) == 1){ | 55 if(length(splt) == 1){ |
| 52 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample")) | 56 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample")) |
| 53 splt[[2]] = data.frame("Patient" = 'NA', "Receptor" = 'NA', "Sample" = 'NA', "Cell_Count" = 100, "Clone_Molecule_Count_From_Spikes" = 10, "Log10_Frequency" = 1, "Total_Read_Count" = 100, "dsMol_per_1e6_cells" = 100, "J_Segment_Major_Gene" = 'NA', "V_Segment_Major_Gene" = 'NA', "Clone_Sequence" = 'NA', "CDR3_Sense_Sequence" = 'NA', "Related_to_leukemia_clone" = FALSE, "Frequency"= 0, "normalized_read_count" = 0, "paste" = 'a') | 57 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0)) |
| 58 type="single" | |
| 54 } | 59 } |
| 55 patient1 = splt[[1]] | 60 patient1 = splt[[1]] |
| 56 patient2 = splt[[2]] | 61 patient2 = splt[[2]] |
| 57 | 62 |
| 58 threshholdIndex = which(colnames(product) == "interval") | 63 threshholdIndex = which(colnames(product) == "interval") |
| 74 patient1 = patient2 | 79 patient1 = patient2 |
| 75 patient2 = tmp | 80 patient2 = tmp |
| 76 switched = T | 81 switched = T |
| 77 } | 82 } |
| 78 if(appendtxt){ | 83 if(appendtxt){ |
| 79 cat(paste(patient, oneSample, twoSample, sep="\t"), file="patients.txt", append=T, sep="", fill=3) | 84 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) |
| 80 } | 85 } |
| 81 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) | 86 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) |
| 82 patientMerge = merge(patient1, patient2, by="Clone_Sequence") | 87 patientMerge = merge(patient1, patient2, by="Clone_Sequence") |
| 83 res1 = vector() | 88 res1 = vector() |
| 84 res2 = vector() | 89 res2 = vector() |
| 105 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count)) | 110 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count)) |
| 106 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count)) | 111 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count)) |
| 107 #threshhold = 0 | 112 #threshhold = 0 |
| 108 if(threshhold != 0){ | 113 if(threshhold != 0){ |
| 109 if(sum(one) > 0){ | 114 if(sum(one) > 0){ |
| 110 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence")] | 115 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] |
| 111 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence") | 116 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
| 112 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | 117 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 113 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 118 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 114 } | 119 } |
| 115 if(sum(two) > 0){ | 120 if(sum(two) > 0){ |
| 116 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence")] | 121 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] |
| 117 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence") | 122 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") |
| 118 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | 123 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 119 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 124 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 120 } | 125 } |
| 121 } | 126 } |
| 122 if(sum(both) > 0){ | 127 if(sum(both) > 0){ |
| 123 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Clone_Sequence", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y")] | 128 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")] |
| 124 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), "Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample)) | 129 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample)) |
| 125 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | 130 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") |
| 126 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | 131 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) |
| 127 } | 132 } |
| 128 } | 133 } |
| 129 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2) | 134 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2) |
| 177 | 182 |
| 178 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) | 183 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) |
| 179 | 184 |
| 180 interval = intervalFreq | 185 interval = intervalFreq |
| 181 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | 186 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) |
| 182 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) | 187 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) |
| 183 #patientFrequencyCount(patient1) | 188 #patientFrequencyCount(patient1) |
| 184 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount) | 189 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount) |
| 185 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 190 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
| 186 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 191 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
| 187 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 192 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
| 188 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 193 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
| 189 | 194 |
| 190 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) | 195 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) |
| 191 | 196 |
| 192 interval = intervalReads | 197 interval = intervalReads |
| 193 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | 198 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) |
| 194 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) | 199 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) |
| 195 #patientResult = patientReadCount(patient1) | 200 #patientResult = patientReadCount(patient1) |
| 196 #lapply(patients[c(5,6,10)], FUN=patientReadCount) | 201 #lapply(patients[c(5,6,10)], FUN=patientReadCount) |
| 197 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 202 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
| 198 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 203 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
| 199 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 204 mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
| 200 | 205 |
| 201 cat("</table></html>", file=logfile, append=T) | 206 cat("</table></html>", file=logfile, append=T) |
| 202 | 207 |
