comparison RScript.r @ 10:b8db36cfe6ad draft

Uploaded
author davidvanzessen
date Mon, 12 Jan 2015 11:07:58 -0500
parents 7dbc9ebcefce
children
comparison
equal deleted inserted replaced
9:7dbc9ebcefce 10:b8db36cfe6ad
29 29
30 infile = args[1] #path to input file 30 infile = args[1] #path to input file
31 outfile = args[2] #path to output file 31 outfile = args[2] #path to output file
32 outdir = args[3] #path to output folder (html/images/data) 32 outdir = args[3] #path to output folder (html/images/data)
33 clonaltype = args[4] #clonaltype definition, or 'none' for no unique filtering 33 clonaltype = args[4] #clonaltype definition, or 'none' for no unique filtering
34 ct = unlist(strsplit(clonaltype, ","))
34 species = args[5] #human or mouse 35 species = args[5] #human or mouse
35 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD 36 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD
36 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) 37 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no)
37 38
38 # ---------------------- Data preperation ---------------------- 39 # ---------------------- Data preperation ----------------------
40 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") 41 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="")
41 42
42 setwd(outdir) 43 setwd(outdir)
43 44
44 # remove weird rows 45 # remove weird rows
45 inputdata = inputdata[inputdata$Sample != "",] 46 inputdata = inputdata[inputdata$Sample != "",]
46 47
47 #remove the allele from the V,D and J genes 48 #remove the allele from the V,D and J genes
48 inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene) 49 inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene)
49 inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene) 50 inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene)
50 inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene) 51 inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene)
52
51 inputdata$clonaltype = 1:nrow(inputdata) 53 inputdata$clonaltype = 1:nrow(inputdata)
54
52 PRODF = inputdata 55 PRODF = inputdata
53 UNPROD = inputdata 56 UNPROD = inputdata
54 if(filterproductive){ 57 if(filterproductive){
55 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column 58 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column
56 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] 59 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ]
61 } 64 }
62 } 65 }
63 66
64 #remove duplicates based on the clonaltype 67 #remove duplicates based on the clonaltype
65 if(clonaltype != "none"){ 68 if(clonaltype != "none"){
69 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples
66 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) 70 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":"))
67 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] 71 PRODF = PRODF[!duplicated(PRODF$clonaltype), ]
68 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":"))
69 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ]
70 } 72 }
71 73
72 PRODF$freq = 1 74 PRODF$freq = 1
73 75
74 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" 76 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*"
94 96
95 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- 97 # ---------------------- Counting the productive/unproductive and unique sequences ----------------------
96 98
97 inputdata.dt = data.table(inputdata) #for speed 99 inputdata.dt = data.table(inputdata) #for speed
98 100
99 ct = unlist(strsplit(clonaltype, ","))
100 if(clonaltype == "none"){ 101 if(clonaltype == "none"){
101 ct = c("ID") 102 ct = c("clonaltype")
102 } 103 }
103 104
104 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") 105 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_")
105 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) 106 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample)))
106 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) 107 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)])
469 470
470 if("Replicate" %in% colnames(inputdata)) #can only calculate clonality score when replicate information is available 471 if("Replicate" %in% colnames(inputdata)) #can only calculate clonality score when replicate information is available
471 { 472 {
472 clonalityFrame = inputdata 473 clonalityFrame = inputdata
473 if(clonaltype != "none"){ 474 if(clonaltype != "none"){
475 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":"))
474 clonalityFrame$ReplicateConcat = paste(clonalityFrame$clonaltype, clonalityFrame$Sample, clonalityFrame$Replicate, sep = ":") 476 clonalityFrame$ReplicateConcat = paste(clonalityFrame$clonaltype, clonalityFrame$Sample, clonalityFrame$Replicate, sep = ":")
475 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] 477 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ]
476 } 478 }
477 write.table(clonalityFrame, "clonalityComplete.csv", sep=",",quote=F,row.names=F,col.names=T) 479 write.table(clonalityFrame, "clonalityComplete.csv", sep=",",quote=F,row.names=F,col.names=T)
478 480
513 } 515 }
514 516
515 ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) 517 ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"])
516 lapply(ReplicateSplit, FUN=ReplicatePrint) 518 lapply(ReplicateSplit, FUN=ReplicatePrint)
517 519
518 ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(Reads), ReadsSquaredSum=sum(squared)), by=c("Sample")]) 520 ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(as.numeric(Reads)), ReadsSquaredSum=sum(as.numeric(squared))), by=c("Sample")])
519 clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) 521 clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T)
520 522
521 523
522 ReplicateSumPrint <- function(dat){ 524 ReplicateSumPrint <- function(dat){
523 write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) 525 write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
556 558
557 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") 559 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb")
558 if(all(imgtcolumns %in% colnames(inputdata))) 560 if(all(imgtcolumns %in% colnames(inputdata)))
559 { 561 {
560 newData = data.frame(data.table(PRODF)[,list(unique=.N, 562 newData = data.frame(data.table(PRODF)[,list(unique=.N,
561 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), 563 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T),
562 P1=mean(P3V.nt.nb, na.rm=T), 564 P1=mean(P3V.nt.nb, na.rm=T),
563 N1=mean(N1.REGION.nt.nb, na.rm=T), 565 N1=mean(N1.REGION.nt.nb, na.rm=T),
564 P2=mean(P5D.nt.nb, na.rm=T), 566 P2=mean(P5D.nt.nb, na.rm=T),
565 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), 567 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T),
566 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), 568 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T),
567 P3=mean(P3D.nt.nb, na.rm=T), 569 P3=mean(P3D.nt.nb, na.rm=T),
568 N2=mean(N2.REGION.nt.nb, na.rm=T), 570 N2=mean(N2.REGION.nt.nb, na.rm=T),
569 P4=mean(P5J.nt.nb, na.rm=T), 571 P4=mean(P5J.nt.nb, na.rm=T),
570 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), 572 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T),
571 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + 573 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) +
572 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + 574 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) +
573 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + 575 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) +
574 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), 576 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)),
575 577
576 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + 578 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) +
577 mean(N2.REGION.nt.nb, na.rm=T)), 579 mean(N2.REGION.nt.nb, na.rm=T)),
578 580
579 Total.P=( mean(P3V.nt.nb, na.rm=T) + 581 Total.P=( mean(P3V.nt.nb, na.rm=T) +
580 mean(P5D.nt.nb, na.rm=T) + 582 mean(P5D.nt.nb, na.rm=T) +
581 mean(P3D.nt.nb, na.rm=T) + 583 mean(P3D.nt.nb, na.rm=T) +
582 mean(P5J.nt.nb, na.rm=T))), 584 mean(P5J.nt.nb, na.rm=T))),
583 by=c("Sample")]) 585 by=c("Sample")])
584 write.table(newData, "junctionAnalysisProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 586 write.table(newData, "junctionAnalysisProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
585 587
586 newData = data.frame(data.table(UNPROD)[,list(unique=.N, 588 newData = data.frame(data.table(UNPROD)[,list(unique=.N,
587 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), 589 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T),
588 P1=mean(P3V.nt.nb, na.rm=T), 590 P1=mean(P3V.nt.nb, na.rm=T),
589 N1=mean(N1.REGION.nt.nb, na.rm=T), 591 N1=mean(N1.REGION.nt.nb, na.rm=T),
590 P2=mean(P5D.nt.nb, na.rm=T), 592 P2=mean(P5D.nt.nb, na.rm=T),
591 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), 593 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T),
592 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), 594 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T),
593 P3=mean(P3D.nt.nb, na.rm=T), 595 P3=mean(P3D.nt.nb, na.rm=T),
594 N2=mean(N2.REGION.nt.nb, na.rm=T), 596 N2=mean(N2.REGION.nt.nb, na.rm=T),
595 P4=mean(P5J.nt.nb, na.rm=T), 597 P4=mean(P5J.nt.nb, na.rm=T),
596 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), 598 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T),
597 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + 599 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) +
598 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + 600 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) +
599 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + 601 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) +
600 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), 602 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)),
601 603
602 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + 604 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) +
603 mean(N2.REGION.nt.nb, na.rm=T)), 605 mean(N2.REGION.nt.nb, na.rm=T)),
604 606
605 Total.P=( mean(P3V.nt.nb, na.rm=T) + 607 Total.P=( mean(P3V.nt.nb, na.rm=T) +
606 mean(P5D.nt.nb, na.rm=T) + 608 mean(P5D.nt.nb, na.rm=T) +
607 mean(P3D.nt.nb, na.rm=T) + 609 mean(P3D.nt.nb, na.rm=T) +
608 mean(P5J.nt.nb, na.rm=T))), 610 mean(P5J.nt.nb, na.rm=T))),
609 by=c("Sample")]) 611 by=c("Sample")])
610 write.table(newData, "junctionAnalysisUnProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 612 write.table(newData, "junctionAnalysisUnProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
611 } 613 }