comparison RScript.r @ 32:548b3035bbd7 draft

Uploaded
author davidvanzessen
date Fri, 08 Nov 2013 05:29:29 -0500
parents 1c25c75d863d
children 8d9be90792b5
comparison
equal deleted inserted replaced
31:d26e8c208b5f 32:548b3035bbd7
2 2
3 args <- commandArgs(trailingOnly = TRUE) 3 args <- commandArgs(trailingOnly = TRUE)
4 4
5 inFile = args[1] 5 inFile = args[1]
6 outFile = args[2] 6 outFile = args[2]
7 7 outDir = args[3]
8 #install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") 8
9 library (gridExtra) 9 if (!require("gridExtra")) {
10 #install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 10 install.packages("gridExtra", repos="http://cran.xl-mirror.nl/")
11 }
12 library(gridExtra)
13 if (!require("ggplot2")) {
14 install.packages("ggplot2", repos="http://cran.xl-mirror.nl/")
15 }
11 require(ggplot2) 16 require(ggplot2)
12 #install.packages("plyr", repos="http://cran.xl-mirror.nl/") 17 if (!require("plyr")) {
18 install.packages("plyr", repos="http://cran.xl-mirror.nl/")
19 }
13 require(plyr) 20 require(plyr)
14 21
15 test = read.table(inFile, sep="\t", header=TRUE) 22 test = read.table(inFile, sep="\t", header=TRUE)
16 23
17 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene) 24 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene)
24 31
25 NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ] 32 NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ]
26 33
27 PRODF = PROD[ -1] 34 PRODF = PROD[ -1]
28 35
29 #unique(PRODF[duplicated(PRODF),]) 36 #PRODF = unique(PRODF)
30 #length(row.names(PRODF[duplicated(PRODF),])) 37 PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ]
31 38
32 #length(row.names(PRODF)) 39
33 PRODF = unique(PRODF) 40 uniqueCount = split(PRODF, f=PRODF[,"Sample"])
34 #length(row.names(PRODF)) 41
42 for(i in 1:length(uniqueCount)) {
43 dat = data.frame(uniqueCount[i])
44 sample = paste(unique(dat[,15]))
45 uniqueCount[sample] = length(dat[,1])
46 }
35 47
36 PRODFV = ddply(PRODF, c("Sample", "Top.V.Gene"), function(x) summary(x$VDJCDR3)) 48 PRODFV = ddply(PRODF, c("Sample", "Top.V.Gene"), function(x) summary(x$VDJCDR3))
37 PRODFV$Length = as.numeric(PRODFV$Length) 49 PRODFV$Length = as.numeric(PRODFV$Length)
38 Total = 0 50 Total = 0
39 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) 51 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length)))
71 tcJ = textConnection(J) 83 tcJ = textConnection(J)
72 Jchain = read.table(tcJ, sep="\t", header=TRUE) 84 Jchain = read.table(tcJ, sep="\t", header=TRUE)
73 PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE) 85 PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE)
74 close(tcJ) 86 close(tcJ)
75 87
88 setwd(outDir)
89
76 pV = ggplot(PRODFV) 90 pV = ggplot(PRODFV)
77 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 91 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
78 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") 92 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage")
79 #pV 93
94 png("VPlot.png",width = 1280, height = 720)
95 pV
96 dev.off();
80 97
81 pD = ggplot(PRODFD) 98 pD = ggplot(PRODFD)
82 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 99 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
83 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") 100 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage")
84 #pD 101
102 png("DPlot.png",width = 800, height = 600)
103 pD
104 dev.off();
85 105
86 pJ = ggplot(PRODFJ) 106 pJ = ggplot(PRODFJ)
87 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 107 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
88 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") 108 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage")
89 #pJ 109
90 110 png("JPlot.png",width = 800, height = 600)
91 #plotall = grid.arrange(pV, arrangeGrob(pD, pJ, ncol=2), ncol=1, widths=c(1,1.2)) 111 pJ
92 #plotall 112 dev.off();
93 #ggsave(outFile, dpi=125) 113
94 114
95 115 plotVD <- function(dat){
96 png(outFile,width = 1920, height = 1200) 116 img = ggplot() +
97 plotall = grid.arrange(pV, arrangeGrob(pD, pJ, ncol=2), ncol=1, widths=c(1,1.2)) 117 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) +
98 dev.off() 118 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
99 119 scale_fill_gradient(low="gold", high="blue", na.value="white") +
100 120 ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) +
121 xlab("D genes") +
122 ylab("V Genes")
123
124 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name)))
125 print(img)
126 dev.off()
127 }
128
129
130 VandDCount = ddply(PRODF, c("Top.V.Gene", "Top.D.Gene", "Sample"), function(x) summary(x$VDJCDR3))
131 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample))
132
133 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
134 completeVD$Length = as.numeric(completeVD$Length)
135 completeVD$log = log(completeVD$Length)
136 completeVD = merge(completeVD, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
137 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
138 #completeVD$log[is.na(completeVD$log)] = 0
139 l = split(completeVD, f=completeVD[,"Sample"])
140
141 lapply(l, FUN=plotVD)
142
143
144
145 plotVJ <- function(dat){
146 img = ggplot() +
147 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=log)) +
148 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
149 scale_fill_gradient(low="gold", high="blue", na.value="white") +
150 ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) +
151 xlab("J genes") +
152 ylab("V Genes")
153
154 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name)))
155 print(img)
156 dev.off()
157 }
158
159 VandJCount = ddply(PRODF, c("Top.V.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
160 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
161
162 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
163 completeVJ$Length = as.numeric(completeVJ$Length)
164 completeVJ$log = log(completeVJ$Length)
165 completeVJ = merge(completeVJ, Vchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
166 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
167 #completeVJ$log[is.na(completeVJ$log)] = 0
168 l = split(completeVJ, f=completeVJ[,"Sample"])
169 lapply(l, FUN=plotVJ)
170
171 plotDJ <- function(dat){
172 img = ggplot() +
173 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=log)) +
174 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
175 scale_fill_gradient(low="gold", high="blue", na.value="white") +
176 ggtitle(paste(unique(dat$Sample), " (N=" , uniqueCount[paste(unique(dat$Sample))] ,")", sep="")) +
177 xlab("J genes") +
178 ylab("D Genes")
179
180 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name)))
181 print(img)
182 dev.off()
183 }
184
185 DandJCount = ddply(PRODF, c("Top.D.Gene", "Top.J.Gene", "Sample"), function(x) summary(x$VDJCDR3))
186 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample))
187
188 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
189 completeDJ$Length = as.numeric(completeDJ$Length)
190 completeDJ$log = log(completeDJ$Length)
191 completeDJ = merge(completeDJ, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
192 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
193 #completeDJ$log[is.na(completeDJ$log)] = 0
194 l = split(completeDJ, f=completeDJ[,"Sample"])
195 lapply(l, FUN=plotDJ)
196
197
198 sampleFile <- file("samples.txt")
199 un = unique(test$Sample)
200 un = paste(un, sep="\n")
201 writeLines(un, sampleFile)
202 close(sampleFile)