clonal_sequences_in_paired_samples: RScript.r comparison

comparison RScript.r @ 34:37d9074ef2c6 draft

Uploaded

author	davidvanzessen
date	Wed, 26 Aug 2015 05:19:55 -0400
parents	642b4593f0a4
children	32d8a5abed4c

comparison

equal deleted inserted replaced

-:642b4593f0a4
+:37d9074ef2c6
 library(grid)
 library(parallel)
 #require(xtable)
 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
+dat = dat[,c("Patient",  "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
+dat$dsPerM = 0
 dat = dat[!is.na(dat$Patient),]
 dat$Related_to_leukemia_clone = F
 setwd(outDir)
 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
+print(paste("rows:", nrow(dat)))
 dat = merge(dat, min_cell_count, by="min_cell_paste")
+print(paste("rows:", nrow(dat)))
 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
 dat = dat[dat$normalized_read_count >= min_cells,]
 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
-cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
 #remove duplicate V+J+CDR3, add together numerical values
-dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
+if(mergeOn != "Clone_Sequence"){
-Cell_Count=unique(.SD$Cell_Count),
+cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
-Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
+dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
-Total_Read_Count=sum(.SD$Total_Read_Count),
+Cell_Count=unique(.SD$Cell_Count),
-dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
+Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
-Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
+Total_Read_Count=sum(.SD$Total_Read_Count),
-Frequency=sum(.SD$Frequency),
+dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
-locus_V=unique(.SD$locus_V),
+Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
-locus_J=unique(.SD$locus_J),
+Frequency=sum(.SD$Frequency),
-min_cell_count=unique(.SD$min_cell_count),
+locus_V=unique(.SD$locus_V),
-normalized_read_count=sum(.SD$normalized_read_count),
+locus_J=unique(.SD$locus_J),
-Log10_Frequency=sum(.SD$Log10_Frequency),
+min_cell_count=unique(.SD$min_cell_count),
-Clone_Sequence=.SD$Clone_Sequence[1],
+normalized_read_count=sum(.SD$normalized_read_count),
-min_cell_paste=.SD$min_cell_paste[1],
+Log10_Frequency=sum(.SD$Log10_Frequency),
-paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
+Clone_Sequence=.SD$Clone_Sequence[1],
+min_cell_paste=.SD$min_cell_paste[1],
+paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
+}
 patients = split(dat, dat$Patient, drop=T)
 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
 if (!is.data.frame(x) & is.list(x)){
 x = x[[1]]
 }
-x$Sample = factor(x$Sample, levels=unique(x$Sample))
+#x$Sample = factor(x$Sample, levels=unique(x$Sample))
+x = data.frame(x,stringsAsFactors=F)
 onShort = "reads"
 if(on == "Frequency"){
 onShort = "freq"
 }
 onx = paste(on, ".x", sep="")
 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
 scatterplot_data$on = onShort
 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
+#fuzzy matching here...
+if(mergeOn == "Clone_Sequence"){
+merge.list = patientMerge$merge
+patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
+patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
+patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
+patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
+merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge)))
+merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
+patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
+patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
+while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){
+current.merge.1 = patient1.fuzzy[1,"merge"]
+current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"]
+current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,]
+#agrep/adist the two samples
+agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 1, costs=list(insertions=0.1, deletions=0.1, substitutions=1))
+if(length(agrep.match) == 1){
+				current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"]
+patientMerge.new.row = data.frame(merge=current.clone.seq.1,
+min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"],
+Patient.x=patient1.fuzzy[1,"Patient"],
+Receptor.x=patient1.fuzzy[1,"Receptor"],
+Sample.x=patient1.fuzzy[1,"Sample"],
+Cell_Count.x=patient1.fuzzy[1,"Cell_Count"],
+Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"],
+Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"],
+Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"],
+dsPerM.x=patient1.fuzzy[1,"dsPerM"],
+J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"],
+V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"],
+Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"],
+CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"],
+Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"],
+Frequency.x=patient1.fuzzy[1,"Frequency"],
+locus_V.x=patient1.fuzzy[1,"locus_V"],
+locus_J.x=patient1.fuzzy[1,"locus_J"],
+min_cell_count.x=patient1.fuzzy[1,"min_cell_count"],
+normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"],
+paste.x=patient1.fuzzy[1,"paste"],
+min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"],
+Patient.y=patient2.fuzzy[agrep.match,"Patient"],
+Receptor.y=patient2.fuzzy[agrep.match,"Receptor"],
+Sample.y=patient2.fuzzy[agrep.match,"Sample"],
+Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"],
+Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"],
+Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"],
+Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"],
+dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"],
+J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"],
+V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"],
+Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"],
+CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"],
+Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"],
+Frequency.y=patient2.fuzzy[agrep.match,"Frequency"],
+locus_V.y=patient2.fuzzy[agrep.match,"locus_V"],
+locus_J.y=patient2.fuzzy[agrep.match,"locus_J"],
+min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"],
+normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"],
+paste.y=patient2.fuzzy[agrep.match,"paste"])
+				#add to patientMerge
+patientMerge = rbind(patientMerge, patientMerge.new.row)
+#remove from patient*.fuzzy
+#remove the fuzzy merged clone sequences from the original datasets
+patient1 <<- patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],]
+patient2 <<- patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],]
+scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,]
+patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),]
+} else if (length(agrep.match) > 1){
+#multiple matches, whatdo?
+cat(paste("<tr><td>", "Multiple matches found for ", current.merge.1, " in ", patient, "</td></tr>", sep=""), file=logfile, append=T)
+}
+patient1.fuzzy = patient1.fuzzy[-1,]
+}
+#adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1))
+}
 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
 res1 = vector()
 res2 = vector()
 resBoth = vector()
 read1Count = vector()
 } else {
 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
 if(nrow(scatterplot_locus_data) > 0){
 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
 }
+in_one = (scatterplot_locus_data$merge %in% patient1$merge)
+in_two = (scatterplot_locus_data$merge %in% patient2$merge)
+not_in_one = !in_one
+if(any(in_two)){
+scatterplot_locus_data[not_in_one,]$type = twoSample
+}
 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
 if(any(in_both)){
 scatterplot_locus_data[in_both,]$type = "In Both"
-}
-in_one = (scatterplot_locus_data$merge %in% patient1$merge)
-not_in_one = !in_one
-if(any(not_in_one)){
-				scatterplot_locus_data[not_in_one,]$type = twoSample
 }
 if(type == "single"){
 single_patients <<- rbind(single_patients, scatterplot_locus_data)
 }
 p = NULL
 if(nrow(scatterplot_locus_data) != 0){
 if(on == "normalized_read_count"){
 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
-p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
+p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
 } else {
 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
 }
 p = p + geom_point(aes(colour=type), position="jitter")
 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))

Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples

comparison RScript.r @ 34:37d9074ef2c6 draft