Mercurial > repos > davidvanzessen > argalaxy_tools
comparison report_clonality/RScript.r @ 59:11ec9edfefee draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Thu, 31 Mar 2016 10:26:30 -0400 |
| parents | a073fa12ef98 |
| children | 14ea4c464435 |
comparison
equal
deleted
inserted
replaced
| 58:a073fa12ef98 | 59:11ec9edfefee |
|---|---|
| 62 #filter uniques | 62 #filter uniques |
| 63 inputdata.removed = inputdata[NULL,] | 63 inputdata.removed = inputdata[NULL,] |
| 64 | 64 |
| 65 inputdata$clonaltype = 1:nrow(inputdata) | 65 inputdata$clonaltype = 1:nrow(inputdata) |
| 66 | 66 |
| 67 #keep track of the count of sequences in samples or samples/replicates for the front page overview | |
| 68 input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")]) | |
| 69 input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")]) | |
| 70 | |
| 67 PRODF = inputdata | 71 PRODF = inputdata |
| 68 UNPROD = inputdata | 72 UNPROD = inputdata |
| 69 if(filterproductive){ | 73 if(filterproductive){ |
| 70 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column | 74 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column |
| 71 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] | 75 #PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] |
| 72 UNPROD = inputdata[!(inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)"), ] | 76 PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),] |
| 77 | |
| 78 PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")]) | |
| 79 | |
| 80 UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ] | |
| 73 } else { | 81 } else { |
| 74 PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ] | 82 PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ] |
| 75 UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ] | 83 UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ] |
| 76 } | 84 } |
| 77 } | 85 } |
| 86 | |
| 87 prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")]) | |
| 88 prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")]) | |
| 89 | |
| 90 unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")]) | |
| 91 unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")]) | |
| 78 | 92 |
| 79 clonalityFrame = PRODF | 93 clonalityFrame = PRODF |
| 80 | 94 |
| 81 #remove duplicates based on the clonaltype | 95 #remove duplicates based on the clonaltype |
| 82 if(clonaltype != "none"){ | 96 if(clonaltype != "none"){ |
| 83 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples | 97 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples |
| 84 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) | 98 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) |
| 85 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] | 99 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] |
| 86 | 100 |
| 87 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) | 101 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) |
| 88 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] | 102 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] |
| 89 | 103 |
| 90 #again for clonalityFrame but with sample+replicate | 104 #again for clonalityFrame but with sample+replicate |
| 91 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) | 105 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) |
| 92 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":")) | 106 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":")) |
| 93 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] | 107 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] |
| 94 } | 108 } |
| 109 | |
| 110 prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")]) | |
| 111 prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")]) | |
| 112 | |
| 113 unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")]) | |
| 114 unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")]) | |
| 95 | 115 |
| 96 PRODF$freq = 1 | 116 PRODF$freq = 1 |
| 97 | 117 |
| 98 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" | 118 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" |
| 99 PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID) | 119 PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID) |
| 120 | 140 |
| 121 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- | 141 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- |
| 122 | 142 |
| 123 print("Report Clonality - counting productive/unproductive/unique") | 143 print("Report Clonality - counting productive/unproductive/unique") |
| 124 | 144 |
| 125 if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data | 145 #create the table on the overview page with the productive/unique counts per sample/replicate |
| 126 inputdata$Functionality = "unproductive" | 146 #first for sample |
| 127 search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND") | 147 sample.count = merge(input.sample.count, prod.sample.count, by="Sample") |
| 128 if(sum(search) > 0){ | 148 sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100) |
| 129 inputdata[search,]$Functionality = "productive" | 149 sample.count = merge(sample.count, prod.unique.sample.count, by="Sample") |
| 130 } | 150 sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100) |
| 131 } | 151 |
| 132 | 152 sample.count = merge(sample.count , unprod.sample.count, by="Sample") |
| 133 inputdata.dt = data.table(inputdata) #for speed | 153 sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100) |
| 134 | 154 sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample") |
| 135 if(clonaltype == "none"){ | 155 sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100) |
| 136 ct = c("clonaltype") | 156 |
| 137 } | 157 #then sample/replicate |
| 138 | 158 rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate")) |
| 139 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") | 159 rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100) |
| 140 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) | 160 rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate")) |
| 141 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) | 161 rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100) |
| 142 | 162 |
| 143 | 163 rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate")) |
| 144 sample_productive_count = inputdata.dt[, list(All=.N, | 164 rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100) |
| 145 Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), | 165 rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate")) |
| 146 perc_prod = 1, | 166 rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100) |
| 147 Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), | 167 |
| 148 perc_prod_un = 1, | 168 rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_") |
| 149 Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]), | 169 rep.count = rep.count[,names(rep.count) != "Replicate"] |
| 150 perc_unprod = 1, | 170 |
| 151 Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]), | 171 count = rbind(sample.count, rep.count) |
| 152 perc_unprod_un = 1), | 172 |
| 153 by=c("Sample")] | 173 write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) |
| 154 | |
| 155 sample_productive_count$perc_prod = round(sample_productive_count$Productive / sample_productive_count$All * 100) | |
| 156 sample_productive_count$perc_prod_un = round(sample_productive_count$Productive_unique / sample_productive_count$All * 100) | |
| 157 | |
| 158 sample_productive_count$perc_unprod = round(sample_productive_count$Unproductive / sample_productive_count$All * 100) | |
| 159 sample_productive_count$perc_unprod_un = round(sample_productive_count$Unproductive_unique / sample_productive_count$All * 100) | |
| 160 | |
| 161 sample_replicate_productive_count = inputdata.dt[, list(All=.N, | |
| 162 Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), | |
| 163 perc_prod = 1, | |
| 164 Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), | |
| 165 perc_prod_un = 1, | |
| 166 Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]), | |
| 167 perc_unprod = 1, | |
| 168 Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]), | |
| 169 perc_unprod_un = 1), | |
| 170 by=c("samples_replicates")] | |
| 171 | |
| 172 sample_replicate_productive_count$perc_prod = round(sample_replicate_productive_count$Productive / sample_replicate_productive_count$All * 100) | |
| 173 sample_replicate_productive_count$perc_prod_un = round(sample_replicate_productive_count$Productive_unique / sample_replicate_productive_count$All * 100) | |
| 174 | |
| 175 sample_replicate_productive_count$perc_unprod = round(sample_replicate_productive_count$Unproductive / sample_replicate_productive_count$All * 100) | |
| 176 sample_replicate_productive_count$perc_unprod_un = round(sample_replicate_productive_count$Unproductive_unique / sample_replicate_productive_count$All * 100) | |
| 177 | |
| 178 setnames(sample_replicate_productive_count, colnames(sample_productive_count)) | |
| 179 | |
| 180 counts = rbind(sample_replicate_productive_count, sample_productive_count) | |
| 181 counts = counts[order(counts$Sample),] | |
| 182 | |
| 183 write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) | |
| 184 | 174 |
| 185 # ---------------------- Frequency calculation for V, D and J ---------------------- | 175 # ---------------------- Frequency calculation for V, D and J ---------------------- |
| 186 | 176 |
| 187 print("Report Clonality - frequency calculation V, D and J") | 177 print("Report Clonality - frequency calculation V, D and J") |
| 188 | 178 |
| 379 print("Report Clonality - Heatmaps VD") | 369 print("Report Clonality - Heatmaps VD") |
| 380 plotVD <- function(dat){ | 370 plotVD <- function(dat){ |
| 381 if(length(dat[,1]) == 0){ | 371 if(length(dat[,1]) == 0){ |
| 382 return() | 372 return() |
| 383 } | 373 } |
| 374 | |
| 384 img = ggplot() + | 375 img = ggplot() + |
| 385 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + | 376 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + |
| 386 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 377 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + |
| 387 scale_fill_gradient(low="gold", high="blue", na.value="white") + | 378 scale_fill_gradient(low="gold", high="blue", na.value="white") + |
| 388 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + | 379 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + |
| 400 VandDCount$l = log(VandDCount$Length) | 391 VandDCount$l = log(VandDCount$Length) |
| 401 maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) | 392 maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) |
| 402 VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) | 393 VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) |
| 403 VandDCount$relLength = VandDCount$l / VandDCount$max | 394 VandDCount$relLength = VandDCount$l / VandDCount$max |
| 404 | 395 |
| 405 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(inputdata$Sample)) | 396 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name) |
| 406 | 397 |
| 407 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) | 398 completeVD = merge(VandDCount, cartegianProductVD, by.x=c("Top.V.Gene", "Top.D.Gene"), by.y=c("Top.V.Gene", "Top.D.Gene"), all=TRUE) |
| 399 | |
| 408 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) | 400 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) |
| 401 | |
| 409 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) | 402 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) |
| 410 | 403 |
| 411 fltr = is.nan(completeVD$relLength) | 404 fltr = is.nan(completeVD$relLength) |
| 412 if(any(fltr)){ | 405 if(all(fltr)){ |
| 413 completeVD[fltr,"relLength"] = 1 | 406 completeVD[fltr,"relLength"] = 0 |
| 414 } | 407 } |
| 415 | 408 |
| 416 VDList = split(completeVD, f=completeVD[,"Sample"]) | 409 VDList = split(completeVD, f=completeVD[,"Sample"]) |
| 417 lapply(VDList, FUN=plotVD) | 410 lapply(VDList, FUN=plotVD) |
| 418 } | 411 } |
| 443 VandJCount$l = log(VandJCount$Length) | 436 VandJCount$l = log(VandJCount$Length) |
| 444 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) | 437 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) |
| 445 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) | 438 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) |
| 446 VandJCount$relLength = VandJCount$l / VandJCount$max | 439 VandJCount$relLength = VandJCount$l / VandJCount$max |
| 447 | 440 |
| 448 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample)) | 441 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name) |
| 449 | 442 |
| 450 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) | 443 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) |
| 451 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) | 444 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) |
| 452 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) | 445 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) |
| 453 | 446 |
| 487 DandJCount$l = log(DandJCount$Length) | 480 DandJCount$l = log(DandJCount$Length) |
| 488 maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")]) | 481 maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")]) |
| 489 DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) | 482 DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) |
| 490 DandJCount$relLength = DandJCount$l / DandJCount$max | 483 DandJCount$relLength = DandJCount$l / DandJCount$max |
| 491 | 484 |
| 492 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample)) | 485 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name) |
| 493 | 486 |
| 494 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) | 487 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) |
| 495 completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) | 488 completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) |
| 496 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) | 489 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) |
| 497 | 490 |
