Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
comparison RScript.r @ 3:f9316f7676cc draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 26 Aug 2014 09:53:22 -0400 |
parents | 8d562506f4f9 |
children | f11df36f43bb |
comparison
equal
deleted
inserted
replaced
2:8d562506f4f9 | 3:f9316f7676cc |
---|---|
1 args <- commandArgs(trailingOnly = TRUE) | 1 args <- commandArgs(trailingOnly = TRUE) |
2 | 2 |
3 inFile = args[1] | 3 inFile = args[1] |
4 outDir = args[2] | 4 outDir = args[2] |
5 logfile = args[3] | |
6 min_freq = as.numeric(args[4]) | |
7 min_cells = as.numeric(args[5]) | |
8 | |
9 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F) | |
5 | 10 |
6 require(ggplot2) | 11 require(ggplot2) |
7 require(reshape2) | 12 require(reshape2) |
8 require(data.table) | 13 require(data.table) |
9 require(grid) | 14 require(grid) |
10 #require(xtable) | 15 #require(xtable) |
16 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | |
11 dat = read.csv(inFile, sep="\t") | 17 dat = read.csv(inFile, sep="\t") |
12 #dat = data.frame(fread(inFile)) #faster but with a dep | 18 #dat = data.frame(fread(inFile)) #faster but with a dep |
13 setwd(outDir) | 19 setwd(outDir) |
20 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) | |
14 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) | 21 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) |
15 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) | 22 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) |
16 | 23 |
24 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) | |
17 dat$Frequency = ((10^dat$Log10_Frequency)*100) | 25 dat$Frequency = ((10^dat$Log10_Frequency)*100) |
18 | 26 |
27 dat = dat[dat$Frequency >= min_freq,] | |
28 | |
29 cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) | |
19 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) | 30 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) |
31 dat = dat[dat$normalized_read_count >= min_cells,] | |
20 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) | 32 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) |
33 cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T) | |
21 dat = dat[!duplicated(dat$paste),] | 34 dat = dat[!duplicated(dat$paste),] |
22 patients = split(dat, dat$Patient, drop=T) | 35 patients = split(dat, dat$Patient, drop=T) |
23 intervalReads = rev(c(0,2,10,100,1000,10000)) | 36 intervalReads = rev(c(0,2,10,100,1000,10000)) |
24 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5)) | 37 intervalFreq = rev(c(0,0.01,0.1,0.5,1,5)) |
25 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") | 38 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") |
34 if(on == "Frequency"){ | 47 if(on == "Frequency"){ |
35 onShort = "freq" | 48 onShort = "freq" |
36 } | 49 } |
37 splt = split(x, x$Sample, drop=T) | 50 splt = split(x, x$Sample, drop=T) |
38 if(length(splt) == 1){ | 51 if(length(splt) == 1){ |
39 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample, skipping")) | 52 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample")) |
40 return() | 53 splt[[2]] = data.frame("Patient" = 'NA', "Receptor" = 'NA', "Sample" = 'NA', "Cell_Count" = 100, "Clone_Molecule_Count_From_Spikes" = 10, "Log10_Frequency" = 1, "Total_Read_Count" = 100, "dsMol_per_1e6_cells" = 100, "J_Segment_Major_Gene" = 'NA', "V_Segment_Major_Gene" = 'NA', "Clone_Sequence" = 'NA', "CDR3_Sense_Sequence" = 'NA', "Related_to_leukemia_clone" = FALSE, "Frequency"= 0, "normalized_read_count" = 0, "paste" = 'a') |
41 } | 54 } |
42 patient1 = splt[[1]] | 55 patient1 = splt[[1]] |
43 patient2 = splt[[2]] | 56 patient2 = splt[[2]] |
44 | 57 |
45 threshholdIndex = which(colnames(product) == "interval") | 58 threshholdIndex = which(colnames(product) == "interval") |
49 sampleIndex = which(colnames(x) == "Sample") | 62 sampleIndex = which(colnames(x) == "Sample") |
50 patientIndex = which(colnames(x) == "Patient") | 63 patientIndex = which(colnames(x) == "Patient") |
51 oneSample = paste(patient1[1,sampleIndex], sep="") | 64 oneSample = paste(patient1[1,sampleIndex], sep="") |
52 twoSample = paste(patient2[1,sampleIndex], sep="") | 65 twoSample = paste(patient2[1,sampleIndex], sep="") |
53 patient = paste(x[1,patientIndex]) | 66 patient = paste(x[1,patientIndex]) |
54 | 67 |
55 #print(c(length(grep(".*_Right$", twoSample)) == 1, "Right")) | |
56 #print(c(length(grep(".*_Dx_BM$", twoSample)) == 1, "Dx_BM")) | |
57 #print(c(length(grep(".*_Dx$", twoSample)) == 1, "Dx")) | |
58 switched = F | 68 switched = F |
59 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){ | 69 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){ |
60 tmp = twoSample | 70 tmp = twoSample |
61 twoSample = oneSample | 71 twoSample = oneSample |
62 oneSample = tmp | 72 oneSample = tmp |
66 switched = T | 76 switched = T |
67 } | 77 } |
68 if(appendtxt){ | 78 if(appendtxt){ |
69 cat(paste(patient, oneSample, twoSample, sep="\t"), file="patients.txt", append=T, sep="", fill=3) | 79 cat(paste(patient, oneSample, twoSample, sep="\t"), file="patients.txt", append=T, sep="", fill=3) |
70 } | 80 } |
81 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) | |
71 patientMerge = merge(patient1, patient2, by="Clone_Sequence") | 82 patientMerge = merge(patient1, patient2, by="Clone_Sequence") |
72 res1 = vector() | 83 res1 = vector() |
73 res2 = vector() | 84 res2 = vector() |
74 resBoth = vector() | 85 resBoth = vector() |
75 read1Count = vector() | 86 read1Count = vector() |
162 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080) | 173 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080) |
163 print(plt) | 174 print(plt) |
164 dev.off() | 175 dev.off() |
165 } | 176 } |
166 | 177 |
178 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) | |
179 | |
167 interval = intervalFreq | 180 interval = intervalFreq |
168 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | 181 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) |
169 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) | 182 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) |
170 #patientFrequencyCount(patient1) | 183 #patientFrequencyCount(patient1) |
171 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount) | 184 #lapply(patients[c(5,6,10)], FUN=patientFrequencyCount) |
172 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 185 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
173 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 186 #lapply(patients[c(6,7,8)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
174 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 187 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
175 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | 188 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) |
176 | 189 |
190 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) | |
191 | |
177 interval = intervalReads | 192 interval = intervalReads |
178 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | 193 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) |
179 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) | 194 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6)) |
180 #patientResult = patientReadCount(patient1) | 195 #patientResult = patientReadCount(patient1) |
181 #lapply(patients[c(5,6,10)], FUN=patientReadCount) | 196 #lapply(patients[c(5,6,10)], FUN=patientReadCount) |
182 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 197 #lapply(patients[c(5,6,7,8,13)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
183 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 198 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
184 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") | 199 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes") |
185 | 200 |
201 cat("</table></html>", file=logfile, append=T) | |
202 |