Mercurial > repos > davidvanzessen > report_clonality_igg
comparison RScript.r @ 21:2424111b9198 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 12 Jan 2015 11:06:49 -0500 |
| parents | b79ad381ced0 |
| children | 2555b94dbdb2 |
comparison
equal
deleted
inserted
replaced
| 20:b79ad381ced0 | 21:2424111b9198 |
|---|---|
| 29 | 29 |
| 30 infile = args[1] #path to input file | 30 infile = args[1] #path to input file |
| 31 outfile = args[2] #path to output file | 31 outfile = args[2] #path to output file |
| 32 outdir = args[3] #path to output folder (html/images/data) | 32 outdir = args[3] #path to output folder (html/images/data) |
| 33 clonaltype = args[4] #clonaltype definition, or 'none' for no unique filtering | 33 clonaltype = args[4] #clonaltype definition, or 'none' for no unique filtering |
| 34 ct = unlist(strsplit(clonaltype, ",")) | |
| 34 species = args[5] #human or mouse | 35 species = args[5] #human or mouse |
| 35 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD | 36 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD |
| 36 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) | 37 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) |
| 37 | 38 |
| 38 # ---------------------- Data preperation ---------------------- | 39 # ---------------------- Data preperation ---------------------- |
| 40 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") | 41 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") |
| 41 | 42 |
| 42 setwd(outdir) | 43 setwd(outdir) |
| 43 | 44 |
| 44 # remove weird rows | 45 # remove weird rows |
| 45 inputdata = inputdata[inputdata$Sample != "",] | 46 inputdata = inputdata[inputdata$Sample != "",] |
| 46 | 47 |
| 47 #remove the allele from the V,D and J genes | 48 #remove the allele from the V,D and J genes |
| 48 inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene) | 49 inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene) |
| 49 inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene) | 50 inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene) |
| 50 inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene) | 51 inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene) |
| 52 | |
| 51 inputdata$clonaltype = 1:nrow(inputdata) | 53 inputdata$clonaltype = 1:nrow(inputdata) |
| 54 | |
| 52 PRODF = inputdata | 55 PRODF = inputdata |
| 53 UNPROD = inputdata | 56 UNPROD = inputdata |
| 54 if(filterproductive){ | 57 if(filterproductive){ |
| 55 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column | 58 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column |
| 56 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] | 59 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] |
| 61 } | 64 } |
| 62 } | 65 } |
| 63 | 66 |
| 64 #remove duplicates based on the clonaltype | 67 #remove duplicates based on the clonaltype |
| 65 if(clonaltype != "none"){ | 68 if(clonaltype != "none"){ |
| 69 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples | |
| 66 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) | 70 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) |
| 67 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] | 71 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] |
| 68 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) | |
| 69 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] | |
| 70 } | 72 } |
| 71 | 73 |
| 72 PRODF$freq = 1 | 74 PRODF$freq = 1 |
| 73 | 75 |
| 74 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" | 76 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" |
| 94 | 96 |
| 95 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- | 97 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- |
| 96 | 98 |
| 97 inputdata.dt = data.table(inputdata) #for speed | 99 inputdata.dt = data.table(inputdata) #for speed |
| 98 | 100 |
| 99 ct = unlist(strsplit(clonaltype, ",")) | |
| 100 if(clonaltype == "none"){ | 101 if(clonaltype == "none"){ |
| 101 ct = c("ID") | 102 ct = c("clonaltype") |
| 102 } | 103 } |
| 103 | 104 |
| 104 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") | 105 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") |
| 105 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) | 106 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) |
| 106 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) | 107 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) |
| 469 | 470 |
| 470 if("Replicate" %in% colnames(inputdata)) #can only calculate clonality score when replicate information is available | 471 if("Replicate" %in% colnames(inputdata)) #can only calculate clonality score when replicate information is available |
| 471 { | 472 { |
| 472 clonalityFrame = inputdata | 473 clonalityFrame = inputdata |
| 473 if(clonaltype != "none"){ | 474 if(clonaltype != "none"){ |
| 475 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) | |
| 474 clonalityFrame$ReplicateConcat = paste(clonalityFrame$clonaltype, clonalityFrame$Sample, clonalityFrame$Replicate, sep = ":") | 476 clonalityFrame$ReplicateConcat = paste(clonalityFrame$clonaltype, clonalityFrame$Sample, clonalityFrame$Replicate, sep = ":") |
| 475 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] | 477 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] |
| 476 } | 478 } |
| 477 write.table(clonalityFrame, "clonalityComplete.csv", sep=",",quote=F,row.names=F,col.names=T) | 479 write.table(clonalityFrame, "clonalityComplete.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 478 | 480 |
| 513 } | 515 } |
| 514 | 516 |
| 515 ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) | 517 ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) |
| 516 lapply(ReplicateSplit, FUN=ReplicatePrint) | 518 lapply(ReplicateSplit, FUN=ReplicatePrint) |
| 517 | 519 |
| 518 ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(Reads), ReadsSquaredSum=sum(squared)), by=c("Sample")]) | 520 ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(as.numeric(Reads)), ReadsSquaredSum=sum(as.numeric(squared))), by=c("Sample")]) |
| 519 clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) | 521 clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) |
| 520 | 522 |
| 521 | 523 |
| 522 ReplicateSumPrint <- function(dat){ | 524 ReplicateSumPrint <- function(dat){ |
| 523 write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) | 525 write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) |
| 556 | 558 |
| 557 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") | 559 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") |
| 558 if(all(imgtcolumns %in% colnames(inputdata))) | 560 if(all(imgtcolumns %in% colnames(inputdata))) |
| 559 { | 561 { |
| 560 newData = data.frame(data.table(PRODF)[,list(unique=.N, | 562 newData = data.frame(data.table(PRODF)[,list(unique=.N, |
| 561 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), | 563 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), |
| 562 P1=mean(P3V.nt.nb, na.rm=T), | 564 P1=mean(P3V.nt.nb, na.rm=T), |
| 563 N1=mean(N1.REGION.nt.nb, na.rm=T), | 565 N1=mean(N1.REGION.nt.nb, na.rm=T), |
| 564 P2=mean(P5D.nt.nb, na.rm=T), | 566 P2=mean(P5D.nt.nb, na.rm=T), |
| 565 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), | 567 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), |
| 566 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), | 568 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), |
| 567 P3=mean(P3D.nt.nb, na.rm=T), | 569 P3=mean(P3D.nt.nb, na.rm=T), |
| 568 N2=mean(N2.REGION.nt.nb, na.rm=T), | 570 N2=mean(N2.REGION.nt.nb, na.rm=T), |
| 569 P4=mean(P5J.nt.nb, na.rm=T), | 571 P4=mean(P5J.nt.nb, na.rm=T), |
| 570 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), | 572 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), |
| 571 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + | 573 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + |
| 572 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + | 574 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + |
| 573 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + | 575 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + |
| 574 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), | 576 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), |
| 575 | 577 |
| 576 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + | 578 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + |
| 577 mean(N2.REGION.nt.nb, na.rm=T)), | 579 mean(N2.REGION.nt.nb, na.rm=T)), |
| 578 | 580 |
| 579 Total.P=( mean(P3V.nt.nb, na.rm=T) + | 581 Total.P=( mean(P3V.nt.nb, na.rm=T) + |
| 580 mean(P5D.nt.nb, na.rm=T) + | 582 mean(P5D.nt.nb, na.rm=T) + |
| 581 mean(P3D.nt.nb, na.rm=T) + | 583 mean(P3D.nt.nb, na.rm=T) + |
| 582 mean(P5J.nt.nb, na.rm=T))), | 584 mean(P5J.nt.nb, na.rm=T))), |
| 583 by=c("Sample")]) | 585 by=c("Sample")]) |
| 584 write.table(newData, "junctionAnalysisProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) | 586 write.table(newData, "junctionAnalysisProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) |
| 585 | 587 |
| 586 newData = data.frame(data.table(UNPROD)[,list(unique=.N, | 588 newData = data.frame(data.table(UNPROD)[,list(unique=.N, |
| 587 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), | 589 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), |
| 588 P1=mean(P3V.nt.nb, na.rm=T), | 590 P1=mean(P3V.nt.nb, na.rm=T), |
| 589 N1=mean(N1.REGION.nt.nb, na.rm=T), | 591 N1=mean(N1.REGION.nt.nb, na.rm=T), |
| 590 P2=mean(P5D.nt.nb, na.rm=T), | 592 P2=mean(P5D.nt.nb, na.rm=T), |
| 591 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), | 593 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), |
| 592 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), | 594 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), |
| 593 P3=mean(P3D.nt.nb, na.rm=T), | 595 P3=mean(P3D.nt.nb, na.rm=T), |
| 594 N2=mean(N2.REGION.nt.nb, na.rm=T), | 596 N2=mean(N2.REGION.nt.nb, na.rm=T), |
| 595 P4=mean(P5J.nt.nb, na.rm=T), | 597 P4=mean(P5J.nt.nb, na.rm=T), |
| 596 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), | 598 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), |
| 597 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + | 599 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + |
| 598 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + | 600 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + |
| 599 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + | 601 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + |
| 600 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), | 602 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), |
| 601 | 603 |
| 602 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + | 604 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + |
| 603 mean(N2.REGION.nt.nb, na.rm=T)), | 605 mean(N2.REGION.nt.nb, na.rm=T)), |
| 604 | 606 |
| 605 Total.P=( mean(P3V.nt.nb, na.rm=T) + | 607 Total.P=( mean(P3V.nt.nb, na.rm=T) + |
| 606 mean(P5D.nt.nb, na.rm=T) + | 608 mean(P5D.nt.nb, na.rm=T) + |
| 607 mean(P3D.nt.nb, na.rm=T) + | 609 mean(P3D.nt.nb, na.rm=T) + |
| 608 mean(P5J.nt.nb, na.rm=T))), | 610 mean(P5J.nt.nb, na.rm=T))), |
| 609 by=c("Sample")]) | 611 by=c("Sample")]) |
| 610 write.table(newData, "junctionAnalysisUnProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) | 612 write.table(newData, "junctionAnalysisUnProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) |
| 611 } | 613 } |
