comparison RScript.r @ 51:17e677c72e49 draft

Uploaded
author davidvanzessen
date Fri, 09 Oct 2015 06:58:17 -0400
parents 7dd7cefcf72d
children c5c2a790d476
comparison
equal deleted inserted replaced
50:7dd7cefcf72d 51:17e677c72e49
63 Titles = factor(Titles, levels=Titles) 63 Titles = factor(Titles, levels=Titles)
64 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles)) 64 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
65 65
66 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0)) 66 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
67 67
68 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
69 patient.merge.list.second = list()
70
68 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ 71 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
69 if (!is.data.frame(x) & is.list(x)){ 72 if (!is.data.frame(x) & is.list(x)){
70 x = x[[1]] 73 x = x[[1]]
71 } 74 }
72 #x$Sample = factor(x$Sample, levels=unique(x$Sample)) 75 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
108 switched = T 111 switched = T
109 } 112 }
110 if(appendtxt){ 113 if(appendtxt){
111 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) 114 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
112 } 115 }
113 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) 116 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
114 117
115 if(mergeOn == "Clone_Sequence"){ 118 if(mergeOn == "Clone_Sequence"){
116 patient1$merge = paste(patient1$Clone_Sequence) 119 patient1$merge = paste(patient1$Clone_Sequence)
117 patient2$merge = paste(patient2$Clone_Sequence) 120 patient2$merge = paste(patient2$Clone_Sequence)
118 } else { 121 } else {
129 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy' 132 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
130 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh 133 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
131 134
132 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence 135 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
133 136
134 137 start.time = proc.time()
135 #fuzzy matching here... 138 merge.list = c()
136 if(mergeOn == "Clone_Sequence"){ 139
140 if(patient %in% names(patient.merge.list)){
141 patientMerge = patient.merge.list[[patient]]
142 merge.list[["second"]] = patient.merge.list.second[[patient]]
143 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
144
145 print(names(patient.merge.list))
146 } else {
147 #fuzzy matching here...
137 #merge.list = patientMerge$merge 148 #merge.list = patientMerge$merge
138 149
139 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),] 150 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
140 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),] 151 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
141 152
142 patient1.fuzzy = patient1 153 patient1.fuzzy = patient1
143 patient2.fuzzy = patient2 154 patient2.fuzzy = patient2
144 155
145 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence) 156 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
146 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence) 157 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
147 158
148 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence) 159 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
149 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence) 160 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
150 161
151 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J) 162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
152 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J) 163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
153 164
154 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove? 165 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
155 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,] 166 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
156 167
157 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] 168 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
158 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] 169 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
159 170
160 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy) 171 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
161 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),] 172 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
162 173
163 merge.list = list() 174 merge.list = list()
164 175
168 while(nrow(patient.fuzzy) > 1){ 179 while(nrow(patient.fuzzy) > 1){
169 first.merge = patient.fuzzy[1,"merge"] 180 first.merge = patient.fuzzy[1,"merge"]
170 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"] 181 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
171 first.sample = patient.fuzzy[1,"Sample"] 182 first.sample = patient.fuzzy[1,"Sample"]
172 merge.filter = first.merge == patient.fuzzy$merge 183 merge.filter = first.merge == patient.fuzzy$merge
173 184
174 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9 185 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
175 186
176 first.sample.filter = first.sample == patient.fuzzy$Sample 187 first.sample.filter = first.sample == patient.fuzzy$Sample
177 second.sample.filter = first.sample != patient.fuzzy$Sample 188 second.sample.filter = first.sample != patient.fuzzy$Sample
178 189
179 #first match same sample, sum to a single row, same for other sample 190 #first match same sample, sum to a single row, same for other sample
180 #then merge rows like 'normal' 191 #then merge rows like 'normal'
181 192
182 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence) 193 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
183 194
184 195
185 196
186 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter 197 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
264 275
265 } else { 276 } else {
266 patient.fuzzy = patient.fuzzy[-1,] 277 patient.fuzzy = patient.fuzzy[-1,]
267 } 278 }
268 } 279 }
269 280 patient.merge.list[[patient]] <<- patientMerge
270 } 281 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
271 282 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
283 }
284
285 print(names(patient.merge.list))
286
272 287
273 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) 288 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
274 res1 = vector() 289 res1 = vector()
275 res2 = vector() 290 res2 = vector()
276 resBoth = vector() 291 resBoth = vector()