Mercurial > repos > davidvanzessen > complete_immunerepertoire_igg
comparison RScript.r @ 10:b8db36cfe6ad draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 12 Jan 2015 11:07:58 -0500 |
parents | 7dbc9ebcefce |
children |
comparison
equal
deleted
inserted
replaced
9:7dbc9ebcefce | 10:b8db36cfe6ad |
---|---|
29 | 29 |
30 infile = args[1] #path to input file | 30 infile = args[1] #path to input file |
31 outfile = args[2] #path to output file | 31 outfile = args[2] #path to output file |
32 outdir = args[3] #path to output folder (html/images/data) | 32 outdir = args[3] #path to output folder (html/images/data) |
33 clonaltype = args[4] #clonaltype definition, or 'none' for no unique filtering | 33 clonaltype = args[4] #clonaltype definition, or 'none' for no unique filtering |
34 ct = unlist(strsplit(clonaltype, ",")) | |
34 species = args[5] #human or mouse | 35 species = args[5] #human or mouse |
35 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD | 36 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD |
36 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) | 37 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) |
37 | 38 |
38 # ---------------------- Data preperation ---------------------- | 39 # ---------------------- Data preperation ---------------------- |
40 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") | 41 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") |
41 | 42 |
42 setwd(outdir) | 43 setwd(outdir) |
43 | 44 |
44 # remove weird rows | 45 # remove weird rows |
45 inputdata = inputdata[inputdata$Sample != "",] | 46 inputdata = inputdata[inputdata$Sample != "",] |
46 | 47 |
47 #remove the allele from the V,D and J genes | 48 #remove the allele from the V,D and J genes |
48 inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene) | 49 inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene) |
49 inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene) | 50 inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene) |
50 inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene) | 51 inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene) |
52 | |
51 inputdata$clonaltype = 1:nrow(inputdata) | 53 inputdata$clonaltype = 1:nrow(inputdata) |
54 | |
52 PRODF = inputdata | 55 PRODF = inputdata |
53 UNPROD = inputdata | 56 UNPROD = inputdata |
54 if(filterproductive){ | 57 if(filterproductive){ |
55 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column | 58 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column |
56 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] | 59 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] |
61 } | 64 } |
62 } | 65 } |
63 | 66 |
64 #remove duplicates based on the clonaltype | 67 #remove duplicates based on the clonaltype |
65 if(clonaltype != "none"){ | 68 if(clonaltype != "none"){ |
69 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples | |
66 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) | 70 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) |
67 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] | 71 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] |
68 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) | |
69 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] | |
70 } | 72 } |
71 | 73 |
72 PRODF$freq = 1 | 74 PRODF$freq = 1 |
73 | 75 |
74 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" | 76 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" |
94 | 96 |
95 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- | 97 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- |
96 | 98 |
97 inputdata.dt = data.table(inputdata) #for speed | 99 inputdata.dt = data.table(inputdata) #for speed |
98 | 100 |
99 ct = unlist(strsplit(clonaltype, ",")) | |
100 if(clonaltype == "none"){ | 101 if(clonaltype == "none"){ |
101 ct = c("ID") | 102 ct = c("clonaltype") |
102 } | 103 } |
103 | 104 |
104 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") | 105 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") |
105 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) | 106 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) |
106 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) | 107 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) |
469 | 470 |
470 if("Replicate" %in% colnames(inputdata)) #can only calculate clonality score when replicate information is available | 471 if("Replicate" %in% colnames(inputdata)) #can only calculate clonality score when replicate information is available |
471 { | 472 { |
472 clonalityFrame = inputdata | 473 clonalityFrame = inputdata |
473 if(clonaltype != "none"){ | 474 if(clonaltype != "none"){ |
475 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) | |
474 clonalityFrame$ReplicateConcat = paste(clonalityFrame$clonaltype, clonalityFrame$Sample, clonalityFrame$Replicate, sep = ":") | 476 clonalityFrame$ReplicateConcat = paste(clonalityFrame$clonaltype, clonalityFrame$Sample, clonalityFrame$Replicate, sep = ":") |
475 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] | 477 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] |
476 } | 478 } |
477 write.table(clonalityFrame, "clonalityComplete.csv", sep=",",quote=F,row.names=F,col.names=T) | 479 write.table(clonalityFrame, "clonalityComplete.csv", sep=",",quote=F,row.names=F,col.names=T) |
478 | 480 |
513 } | 515 } |
514 | 516 |
515 ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) | 517 ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) |
516 lapply(ReplicateSplit, FUN=ReplicatePrint) | 518 lapply(ReplicateSplit, FUN=ReplicatePrint) |
517 | 519 |
518 ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(Reads), ReadsSquaredSum=sum(squared)), by=c("Sample")]) | 520 ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(as.numeric(Reads)), ReadsSquaredSum=sum(as.numeric(squared))), by=c("Sample")]) |
519 clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) | 521 clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) |
520 | 522 |
521 | 523 |
522 ReplicateSumPrint <- function(dat){ | 524 ReplicateSumPrint <- function(dat){ |
523 write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) | 525 write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) |
556 | 558 |
557 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") | 559 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") |
558 if(all(imgtcolumns %in% colnames(inputdata))) | 560 if(all(imgtcolumns %in% colnames(inputdata))) |
559 { | 561 { |
560 newData = data.frame(data.table(PRODF)[,list(unique=.N, | 562 newData = data.frame(data.table(PRODF)[,list(unique=.N, |
561 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), | 563 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), |
562 P1=mean(P3V.nt.nb, na.rm=T), | 564 P1=mean(P3V.nt.nb, na.rm=T), |
563 N1=mean(N1.REGION.nt.nb, na.rm=T), | 565 N1=mean(N1.REGION.nt.nb, na.rm=T), |
564 P2=mean(P5D.nt.nb, na.rm=T), | 566 P2=mean(P5D.nt.nb, na.rm=T), |
565 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), | 567 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), |
566 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), | 568 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), |
567 P3=mean(P3D.nt.nb, na.rm=T), | 569 P3=mean(P3D.nt.nb, na.rm=T), |
568 N2=mean(N2.REGION.nt.nb, na.rm=T), | 570 N2=mean(N2.REGION.nt.nb, na.rm=T), |
569 P4=mean(P5J.nt.nb, na.rm=T), | 571 P4=mean(P5J.nt.nb, na.rm=T), |
570 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), | 572 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), |
571 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + | 573 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + |
572 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + | 574 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + |
573 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + | 575 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + |
574 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), | 576 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), |
575 | 577 |
576 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + | 578 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + |
577 mean(N2.REGION.nt.nb, na.rm=T)), | 579 mean(N2.REGION.nt.nb, na.rm=T)), |
578 | 580 |
579 Total.P=( mean(P3V.nt.nb, na.rm=T) + | 581 Total.P=( mean(P3V.nt.nb, na.rm=T) + |
580 mean(P5D.nt.nb, na.rm=T) + | 582 mean(P5D.nt.nb, na.rm=T) + |
581 mean(P3D.nt.nb, na.rm=T) + | 583 mean(P3D.nt.nb, na.rm=T) + |
582 mean(P5J.nt.nb, na.rm=T))), | 584 mean(P5J.nt.nb, na.rm=T))), |
583 by=c("Sample")]) | 585 by=c("Sample")]) |
584 write.table(newData, "junctionAnalysisProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) | 586 write.table(newData, "junctionAnalysisProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) |
585 | 587 |
586 newData = data.frame(data.table(UNPROD)[,list(unique=.N, | 588 newData = data.frame(data.table(UNPROD)[,list(unique=.N, |
587 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), | 589 VH.DEL=mean(X3V.REGION.trimmed.nt.nb, na.rm=T), |
588 P1=mean(P3V.nt.nb, na.rm=T), | 590 P1=mean(P3V.nt.nb, na.rm=T), |
589 N1=mean(N1.REGION.nt.nb, na.rm=T), | 591 N1=mean(N1.REGION.nt.nb, na.rm=T), |
590 P2=mean(P5D.nt.nb, na.rm=T), | 592 P2=mean(P5D.nt.nb, na.rm=T), |
591 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), | 593 DEL.DH=mean(X5D.REGION.trimmed.nt.nb, na.rm=T), |
592 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), | 594 DH.DEL=mean(X3D.REGION.trimmed.nt.nb, na.rm=T), |
593 P3=mean(P3D.nt.nb, na.rm=T), | 595 P3=mean(P3D.nt.nb, na.rm=T), |
594 N2=mean(N2.REGION.nt.nb, na.rm=T), | 596 N2=mean(N2.REGION.nt.nb, na.rm=T), |
595 P4=mean(P5J.nt.nb, na.rm=T), | 597 P4=mean(P5J.nt.nb, na.rm=T), |
596 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), | 598 DEL.JH=mean(X5J.REGION.trimmed.nt.nb, na.rm=T), |
597 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + | 599 Total.Del=( mean(X3V.REGION.trimmed.nt.nb, na.rm=T) + |
598 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + | 600 mean(X5D.REGION.trimmed.nt.nb, na.rm=T) + |
599 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + | 601 mean(X3D.REGION.trimmed.nt.nb, na.rm=T) + |
600 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), | 602 mean(X5J.REGION.trimmed.nt.nb, na.rm=T)), |
601 | 603 |
602 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + | 604 Total.N=( mean(N1.REGION.nt.nb, na.rm=T) + |
603 mean(N2.REGION.nt.nb, na.rm=T)), | 605 mean(N2.REGION.nt.nb, na.rm=T)), |
604 | 606 |
605 Total.P=( mean(P3V.nt.nb, na.rm=T) + | 607 Total.P=( mean(P3V.nt.nb, na.rm=T) + |
606 mean(P5D.nt.nb, na.rm=T) + | 608 mean(P5D.nt.nb, na.rm=T) + |
607 mean(P3D.nt.nb, na.rm=T) + | 609 mean(P3D.nt.nb, na.rm=T) + |
608 mean(P5J.nt.nb, na.rm=T))), | 610 mean(P5J.nt.nb, na.rm=T))), |
609 by=c("Sample")]) | 611 by=c("Sample")]) |
610 write.table(newData, "junctionAnalysisUnProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) | 612 write.table(newData, "junctionAnalysisUnProd.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) |
611 } | 613 } |